From d979ac33a210d548971eac3a0ba64449dcce886f Mon Sep 17 00:00:00 2001 From: Lynn Boger Date: Mon, 16 Sep 2019 16:59:38 -0400 Subject: [PATCH] crypto/cipher: improve xorBytesVSX asm for ppc64x MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This improves the performance of xorBytesVSX in crypto/cipher by unrolling the loop that does the stores. Improvement on power9: name old time/op new time/op delta XORBytes/8Bytes 17.9ns ± 0% 18.2ns ± 0% +1.53% (p=0.029 n=4+4) XORBytes/128Bytes 24.4ns ± 0% 22.5ns ± 0% -7.79% (p=0.029 n=4+4) XORBytes/2048Bytes 131ns ± 0% 109ns ± 0% -16.79% (p=0.029 n=4+4) XORBytes/32768Bytes 1.74µs ± 0% 1.43µs ± 8% -18.04% (p=0.029 n=4+4) Change-Id: I75bd625d3ae9daa7bda54c523028671ab036b13d Reviewed-on: https://go-review.googlesource.com/c/go/+/197058 Run-TryBot: Lynn Boger TryBot-Result: Gobot Gobot Reviewed-by: Carlos Eduardo Seo --- src/crypto/cipher/xor_ppc64x.s | 77 +++++++++++++++++++++------------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/src/crypto/cipher/xor_ppc64x.s b/src/crypto/cipher/xor_ppc64x.s index af4d08bda3..4cef31d0ee 100644 --- a/src/crypto/cipher/xor_ppc64x.s +++ b/src/crypto/cipher/xor_ppc64x.s @@ -13,45 +13,66 @@ TEXT ·xorBytesVSX(SB), NOSPLIT, $0 MOVD b+16(FP), R5 // R5 = b MOVD n+24(FP), R6 // R6 = n - CMPU R6, $16, CR7 // Check if n ≥ 16 bytes + CMPU R6, $32, CR7 // Check if n ≥ 32 bytes MOVD R0, R8 // R8 = index - CMPU R6, $8, CR6 // Check if 8 ≤ n < 16 bytes - BGE CR7, preloop16 - BLT CR6, small + CMPU R6, $8, CR6 // Check if 8 ≤ n < 32 bytes + BLT CR6, small // Smaller than 8 + BLT CR7, xor16 // Case for 16 ≤ n < 32 bytes - // Case for 8 ≤ n < 16 bytes - MOVD (R4)(R8), R14 // R14 = a[i,...,i+7] - MOVD (R5)(R8), R15 // R15 = b[i,...,i+7] - XOR R14, R15, R16 // R16 = a[] ^ b[] - SUB $8, R6 // n = n - 8 - MOVD R16, (R3)(R8) // Store to dst - ADD $8, R8 - - // Check if we're finished - CMP R6, R0 - BGT small - JMP done - - // Case for n ≥ 16 bytes -preloop16: - SRD $4, R6, R7 // Setup loop counter + // Case for n ≥ 32 bytes +preloop32: + SRD $5, R6, R7 // Setup loop counter MOVD R7, CTR - ANDCC $15, R6, R9 // Check for tailing bytes for later -loop16: + MOVD $16, R10 + ANDCC $31, R6, R9 // Check for tailing bytes for later +loop32: LXVD2X (R4)(R8), VS32 // VS32 = a[i,...,i+15] + LXVD2X (R4)(R10), VS34 LXVD2X (R5)(R8), VS33 // VS33 = b[i,...,i+15] - XXLXOR VS32, VS33, VS34 // VS34 = a[] ^ b[] - STXVD2X VS34, (R3)(R8) // Store to dst - ADD $16, R8 // Update index - BC 16, 0, loop16 // bdnz loop16 + LXVD2X (R5)(R10), VS35 + XXLXOR VS32, VS33, VS32 // VS34 = a[] ^ b[] + XXLXOR VS34, VS35, VS34 + STXVD2X VS32, (R3)(R8) // Store to dst + STXVD2X VS34, (R3)(R10) + ADD $32, R8 // Update index + ADD $32, R10 + BC 16, 0, loop32 // bdnz loop16 BEQ CR0, done - SLD $4, R7 - SUB R7, R6 // R6 = n - (R7 * 16) + + MOVD R9, R6 + CMP R6, $8 + BLT small +xor16: + CMP R6, $16 + BLT xor8 + LXVD2X (R4)(R8), VS32 + LXVD2X (R5)(R8), VS33 + XXLXOR VS32, VS33, VS32 + STXVD2X VS32, (R3)(R8) + ADD $16, R8 + ADD $-16, R6 + CMP R6, $8 + BLT small +xor8: + // Case for 8 ≤ n < 16 bytes + MOVD (R4)(R8), R14 // R14 = a[i,...,i+7] + MOVD (R5)(R8), R15 // R15 = b[i,...,i+7] + XOR R14, R15, R16 // R16 = a[] ^ b[] + SUB $8, R6 // n = n - 8 + MOVD R16, (R3)(R8) // Store to dst + ADD $8, R8 + + // Check if we're finished + CMP R6, R0 + BGT small + RET // Case for n < 8 bytes and tailing bytes from the // previous cases. small: + CMP R6, R0 + BEQ done MOVD R6, CTR // Setup loop counter loop: -- 2.50.0