]> Cypherpunks repositories - gostls13.git/commitdiff
crypto/cipher: add VSX implementation of xorBytes for ppc64x
authorCarlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Wed, 19 Sep 2018 21:37:08 +0000 (18:37 -0300)
committerCarlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Thu, 1 Nov 2018 15:56:48 +0000 (15:56 +0000)
This change adds asm implementations of xorBytes for ppc64x that
takes advantage of VSX registers and instructions.

name                   old time/op    new time/op     delta
XORBytes/8Bytes-8        16.4ns ± 0%     11.1ns ± 0%   -32.32%  (p=0.000 n=5+4)
XORBytes/128Bytes-8      45.6ns ± 0%     16.2ns ± 0%   -64.50%  (p=0.008 n=5+5)
XORBytes/2048Bytes-8      433ns ±13%      129ns ± 1%   -70.29%  (p=0.000 n=5+4)
XORBytes/32768Bytes-8    7.16µs ± 0%     1.83µs ± 0%   -74.39%  (p=0.008 n=5+5)

name                   old speed      new speed       delta
XORBytes/8Bytes-8       488MB/s ± 0%    721MB/s ± 0%   +47.75%  (p=0.016 n=5+4)
XORBytes/128Bytes-8    2.80GB/s ± 0%   7.89GB/s ± 0%  +181.33%  (p=0.008 n=5+5)
XORBytes/2048Bytes-8   4.77GB/s ±13%  15.87GB/s ± 0%  +232.68%  (p=0.016 n=5+4)
XORBytes/32768Bytes-8  4.58GB/s ± 0%  17.88GB/s ± 0%  +290.47%  (p=0.008 n=5+5)

Change-Id: Ic27d9b858f8ec2d597fdabc68a288d6844eba701
Reviewed-on: https://go-review.googlesource.com/c/145997
Run-TryBot: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
src/crypto/cipher/xor_generic.go
src/crypto/cipher/xor_ppc64x.go [new file with mode: 0644]
src/crypto/cipher/xor_ppc64x.s [new file with mode: 0644]

index 4d660b0a75a50d95bb26742977d310987758bfe1..b7de60873c5608223b7fc3d34f239d4243bd39fd 100644 (file)
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64
+// +build !amd64,!ppc64,!ppc64le
 
 package cipher
 
diff --git a/src/crypto/cipher/xor_ppc64x.go b/src/crypto/cipher/xor_ppc64x.go
new file mode 100644 (file)
index 0000000..8d2e43d
--- /dev/null
@@ -0,0 +1,29 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64 ppc64le
+
+package cipher
+
+// xorBytes xors the bytes in a and b. The destination should have enough
+// space, otherwise xorBytes will panic. Returns the number of bytes xor'd.
+func xorBytes(dst, a, b []byte) int {
+       n := len(a)
+       if len(b) < n {
+               n = len(b)
+       }
+       if n == 0 {
+               return 0
+       }
+       _ = dst[n-1]
+       xorBytesVSX(&dst[0], &a[0], &b[0], n)
+       return n
+}
+
+func xorWords(dst, a, b []byte) {
+       xorBytes(dst, a, b)
+}
+
+//go:noescape
+func xorBytesVSX(dst, a, b *byte, n int)
diff --git a/src/crypto/cipher/xor_ppc64x.s b/src/crypto/cipher/xor_ppc64x.s
new file mode 100644 (file)
index 0000000..af4d08b
--- /dev/null
@@ -0,0 +1,66 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64 ppc64le
+
+#include "textflag.h"
+
+// func xorBytesVSX(dst, a, b *byte, n int)
+TEXT ·xorBytesVSX(SB), NOSPLIT, $0
+       MOVD    dst+0(FP), R3   // R3 = dst
+       MOVD    a+8(FP), R4     // R4 = a
+       MOVD    b+16(FP), R5    // R5 = b
+       MOVD    n+24(FP), R6    // R6 = n
+
+       CMPU    R6, $16, CR7    // Check if n ≥ 16 bytes
+       MOVD    R0, R8          // R8 = index
+       CMPU    R6, $8, CR6     // Check if 8 ≤ n < 16 bytes
+       BGE     CR7, preloop16
+       BLT     CR6, small
+
+       // Case for 8 ≤ n < 16 bytes
+       MOVD    (R4)(R8), R14   // R14 = a[i,...,i+7]
+       MOVD    (R5)(R8), R15   // R15 = b[i,...,i+7]
+       XOR     R14, R15, R16   // R16 = a[] ^ b[]
+       SUB     $8, R6          // n = n - 8
+       MOVD    R16, (R3)(R8)   // Store to dst
+       ADD     $8, R8
+
+       // Check if we're finished
+       CMP     R6, R0
+       BGT     small
+       JMP     done
+
+       // Case for n ≥ 16 bytes
+preloop16:
+       SRD     $4, R6, R7      // Setup loop counter
+       MOVD    R7, CTR
+       ANDCC   $15, R6, R9     // Check for tailing bytes for later
+loop16:
+       LXVD2X          (R4)(R8), VS32          // VS32 = a[i,...,i+15]
+       LXVD2X          (R5)(R8), VS33          // VS33 = b[i,...,i+15]
+       XXLXOR          VS32, VS33, VS34        // VS34 = a[] ^ b[]
+       STXVD2X         VS34, (R3)(R8)          // Store to dst
+       ADD             $16, R8                 // Update index
+       BC              16, 0, loop16           // bdnz loop16
+
+       BEQ             CR0, done
+       SLD             $4, R7
+       SUB             R7, R6                  // R6 = n - (R7 * 16)
+
+       // Case for n < 8 bytes and tailing bytes from the
+       // previous cases.
+small:
+       MOVD    R6, CTR         // Setup loop counter
+
+loop:
+       MOVBZ   (R4)(R8), R14   // R14 = a[i]
+       MOVBZ   (R5)(R8), R15   // R15 = b[i]
+       XOR     R14, R15, R16   // R16 = a[i] ^ b[i]
+       MOVB    R16, (R3)(R8)   // Store to dst
+       ADD     $1, R8
+       BC      16, 0, loop     // bdnz loop
+
+done:
+       RET