From d979ac33a210d548971eac3a0ba64449dcce886f Mon Sep 17 00:00:00 2001
From: Lynn Boger <laboger@linux.vnet.ibm.com>
Date: Mon, 16 Sep 2019 16:59:38 -0400
Subject: [PATCH] crypto/cipher: improve xorBytesVSX asm for ppc64x
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This improves the performance of xorBytesVSX in crypto/cipher by
unrolling the loop that does the stores. Improvement on power9:

name                 old time/op    new time/op    delta
XORBytes/8Bytes        17.9ns ± 0%    18.2ns ± 0%   +1.53%  (p=0.029 n=4+4)
XORBytes/128Bytes      24.4ns ± 0%    22.5ns ± 0%   -7.79%  (p=0.029 n=4+4)
XORBytes/2048Bytes      131ns ± 0%     109ns ± 0%  -16.79%  (p=0.029 n=4+4)
XORBytes/32768Bytes    1.74µs ± 0%    1.43µs ± 8%  -18.04%  (p=0.029 n=4+4)

Change-Id: I75bd625d3ae9daa7bda54c523028671ab036b13d
Reviewed-on: https://go-review.googlesource.com/c/go/+/197058
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
---
 src/crypto/cipher/xor_ppc64x.s | 77 +++++++++++++++++++++-------------
 1 file changed, 49 insertions(+), 28 deletions(-)

diff --git a/src/crypto/cipher/xor_ppc64x.s b/src/crypto/cipher/xor_ppc64x.s
index af4d08bda3..4cef31d0ee 100644
--- a/src/crypto/cipher/xor_ppc64x.s
+++ b/src/crypto/cipher/xor_ppc64x.s
@@ -13,45 +13,66 @@ TEXT ·xorBytesVSX(SB), NOSPLIT, $0
 	MOVD	b+16(FP), R5	// R5 = b
 	MOVD	n+24(FP), R6	// R6 = n
 
-	CMPU	R6, $16, CR7	// Check if n ≥ 16 bytes
+	CMPU	R6, $32, CR7	// Check if n ≥ 32 bytes
 	MOVD	R0, R8		// R8 = index
-	CMPU	R6, $8, CR6	// Check if 8 ≤ n < 16 bytes
-	BGE	CR7, preloop16
-	BLT	CR6, small
+	CMPU	R6, $8, CR6	// Check if 8 ≤ n < 32 bytes
+	BLT	CR6, small	// Smaller than 8
+	BLT	CR7, xor16	// Case for 16 ≤ n < 32 bytes
 
-	// Case for 8 ≤ n < 16 bytes
-	MOVD	(R4)(R8), R14	// R14 = a[i,...,i+7]
-	MOVD	(R5)(R8), R15	// R15 = b[i,...,i+7]
-	XOR	R14, R15, R16	// R16 = a[] ^ b[]
-	SUB	$8, R6		// n = n - 8
-	MOVD	R16, (R3)(R8)	// Store to dst
-	ADD	$8, R8
-
-	// Check if we're finished
-	CMP	R6, R0
-	BGT	small
-	JMP	done
-
-	// Case for n ≥ 16 bytes
-preloop16:
-	SRD	$4, R6, R7	// Setup loop counter
+	// Case for n ≥ 32 bytes
+preloop32:
+	SRD	$5, R6, R7	// Setup loop counter
 	MOVD	R7, CTR
-	ANDCC	$15, R6, R9	// Check for tailing bytes for later
-loop16:
+	MOVD	$16, R10
+	ANDCC	$31, R6, R9	// Check for tailing bytes for later
+loop32:
 	LXVD2X		(R4)(R8), VS32		// VS32 = a[i,...,i+15]
+	LXVD2X		(R4)(R10), VS34
 	LXVD2X		(R5)(R8), VS33		// VS33 = b[i,...,i+15]
-	XXLXOR		VS32, VS33, VS34	// VS34 = a[] ^ b[]
-	STXVD2X		VS34, (R3)(R8)		// Store to dst
-	ADD		$16, R8			// Update index
-	BC		16, 0, loop16		// bdnz loop16
+	LXVD2X		(R5)(R10), VS35
+	XXLXOR		VS32, VS33, VS32	// VS34 = a[] ^ b[]
+	XXLXOR		VS34, VS35, VS34
+	STXVD2X		VS32, (R3)(R8)		// Store to dst
+	STXVD2X		VS34, (R3)(R10)
+	ADD		$32, R8			// Update index
+	ADD		$32, R10
+	BC		16, 0, loop32		// bdnz loop16
 
 	BEQ		CR0, done
-	SLD		$4, R7
-	SUB		R7, R6			// R6 = n - (R7 * 16)
+
+	MOVD		R9, R6
+	CMP		R6, $8
+	BLT		small
+xor16:
+	CMP		R6, $16
+	BLT		xor8
+	LXVD2X		(R4)(R8), VS32
+	LXVD2X		(R5)(R8), VS33
+	XXLXOR		VS32, VS33, VS32
+	STXVD2X		VS32, (R3)(R8)
+	ADD		$16, R8
+	ADD		$-16, R6
+	CMP		R6, $8
+	BLT		small
+xor8:
+	// Case for 8 ≤ n < 16 bytes
+	MOVD    (R4)(R8), R14   // R14 = a[i,...,i+7]
+	MOVD    (R5)(R8), R15   // R15 = b[i,...,i+7]
+	XOR     R14, R15, R16   // R16 = a[] ^ b[]
+	SUB     $8, R6          // n = n - 8
+	MOVD    R16, (R3)(R8)   // Store to dst
+	ADD     $8, R8
+
+	// Check if we're finished
+	CMP     R6, R0
+	BGT     small
+	RET
 
 	// Case for n < 8 bytes and tailing bytes from the
 	// previous cases.
 small:
+	CMP	R6, R0
+	BEQ	done
 	MOVD	R6, CTR		// Setup loop counter
 
 loop:
-- 
2.50.0