]> Cypherpunks repositories - gostls13.git/commitdiff
crypto/cipher: improve xorBytesVSX asm for ppc64x
authorLynn Boger <laboger@linux.vnet.ibm.com>
Mon, 16 Sep 2019 20:59:38 +0000 (16:59 -0400)
committerLynn Boger <laboger@linux.vnet.ibm.com>
Tue, 24 Sep 2019 21:30:45 +0000 (21:30 +0000)
This improves the performance of xorBytesVSX in crypto/cipher by
unrolling the loop that does the stores. Improvement on power9:

name                 old time/op    new time/op    delta
XORBytes/8Bytes        17.9ns ± 0%    18.2ns ± 0%   +1.53%  (p=0.029 n=4+4)
XORBytes/128Bytes      24.4ns ± 0%    22.5ns ± 0%   -7.79%  (p=0.029 n=4+4)
XORBytes/2048Bytes      131ns ± 0%     109ns ± 0%  -16.79%  (p=0.029 n=4+4)
XORBytes/32768Bytes    1.74µs ± 0%    1.43µs ± 8%  -18.04%  (p=0.029 n=4+4)

Change-Id: I75bd625d3ae9daa7bda54c523028671ab036b13d
Reviewed-on: https://go-review.googlesource.com/c/go/+/197058
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
src/crypto/cipher/xor_ppc64x.s

index af4d08bda3b9a0c3e9eccb3a271e6a5065fdc7da..4cef31d0ee344b81453a26b592fc3f42743d3df2 100644 (file)
@@ -13,45 +13,66 @@ TEXT ·xorBytesVSX(SB), NOSPLIT, $0
        MOVD    b+16(FP), R5    // R5 = b
        MOVD    n+24(FP), R6    // R6 = n
 
-       CMPU    R6, $16, CR7    // Check if n ≥ 16 bytes
+       CMPU    R6, $32, CR7    // Check if n ≥ 32 bytes
        MOVD    R0, R8          // R8 = index
-       CMPU    R6, $8, CR6     // Check if 8 ≤ n < 16 bytes
-       BGE     CR7, preloop16
-       BLT     CR6, small
+       CMPU    R6, $8, CR6     // Check if 8 ≤ n < 32 bytes
+       BLT     CR6, small      // Smaller than 8
+       BLT     CR7, xor16      // Case for 16 ≤ n < 32 bytes
 
-       // Case for 8 ≤ n < 16 bytes
-       MOVD    (R4)(R8), R14   // R14 = a[i,...,i+7]
-       MOVD    (R5)(R8), R15   // R15 = b[i,...,i+7]
-       XOR     R14, R15, R16   // R16 = a[] ^ b[]
-       SUB     $8, R6          // n = n - 8
-       MOVD    R16, (R3)(R8)   // Store to dst
-       ADD     $8, R8
-
-       // Check if we're finished
-       CMP     R6, R0
-       BGT     small
-       JMP     done
-
-       // Case for n ≥ 16 bytes
-preloop16:
-       SRD     $4, R6, R7      // Setup loop counter
+       // Case for n ≥ 32 bytes
+preloop32:
+       SRD     $5, R6, R7      // Setup loop counter
        MOVD    R7, CTR
-       ANDCC   $15, R6, R9     // Check for tailing bytes for later
-loop16:
+       MOVD    $16, R10
+       ANDCC   $31, R6, R9     // Check for tailing bytes for later
+loop32:
        LXVD2X          (R4)(R8), VS32          // VS32 = a[i,...,i+15]
+       LXVD2X          (R4)(R10), VS34
        LXVD2X          (R5)(R8), VS33          // VS33 = b[i,...,i+15]
-       XXLXOR          VS32, VS33, VS34        // VS34 = a[] ^ b[]
-       STXVD2X         VS34, (R3)(R8)          // Store to dst
-       ADD             $16, R8                 // Update index
-       BC              16, 0, loop16           // bdnz loop16
+       LXVD2X          (R5)(R10), VS35
+       XXLXOR          VS32, VS33, VS32        // VS34 = a[] ^ b[]
+       XXLXOR          VS34, VS35, VS34
+       STXVD2X         VS32, (R3)(R8)          // Store to dst
+       STXVD2X         VS34, (R3)(R10)
+       ADD             $32, R8                 // Update index
+       ADD             $32, R10
+       BC              16, 0, loop32           // bdnz loop16
 
        BEQ             CR0, done
-       SLD             $4, R7
-       SUB             R7, R6                  // R6 = n - (R7 * 16)
+
+       MOVD            R9, R6
+       CMP             R6, $8
+       BLT             small
+xor16:
+       CMP             R6, $16
+       BLT             xor8
+       LXVD2X          (R4)(R8), VS32
+       LXVD2X          (R5)(R8), VS33
+       XXLXOR          VS32, VS33, VS32
+       STXVD2X         VS32, (R3)(R8)
+       ADD             $16, R8
+       ADD             $-16, R6
+       CMP             R6, $8
+       BLT             small
+xor8:
+       // Case for 8 ≤ n < 16 bytes
+       MOVD    (R4)(R8), R14   // R14 = a[i,...,i+7]
+       MOVD    (R5)(R8), R15   // R15 = b[i,...,i+7]
+       XOR     R14, R15, R16   // R16 = a[] ^ b[]
+       SUB     $8, R6          // n = n - 8
+       MOVD    R16, (R3)(R8)   // Store to dst
+       ADD     $8, R8
+
+       // Check if we're finished
+       CMP     R6, R0
+       BGT     small
+       RET
 
        // Case for n < 8 bytes and tailing bytes from the
        // previous cases.
 small:
+       CMP     R6, R0
+       BEQ     done
        MOVD    R6, CTR         // Setup loop counter
 
 loop: