From 9ccf5b8e86ce98494a2127196fbc47d72b0a71a5 Mon Sep 17 00:00:00 2001
From: Lynn Boger <laboger@linux.vnet.ibm.com>
Date: Fri, 8 Apr 2022 13:50:00 -0500
Subject: [PATCH] runtime: improve memmove for ppc64x
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This improves performance of memmove for larger moves by
unrolling the main loop from 32 byte to 64 byte moves.

The improvement of the relevant sizes on a power9:

Memmove/64      5.11ns Â± 0%    5.00ns Â± 0%   -2.21%
Memmove/128     8.26ns Â± 0%    5.88ns Â± 0%  -28.83%
Memmove/256     12.7ns Â± 0%     8.6ns Â± 0%  -31.94%
Memmove/512     17.9ns Â± 0%    14.3ns Â± 0%  -19.87%
Memmove/1024    33.3ns Â± 0%    27.0ns Â± 0%  -18.92%
Memmove/2048    72.1ns Â± 0%    51.8ns Â± 0%  -28.25%
Memmove/4096     126ns Â± 0%     110ns Â± 0%  -12.63%


Change-Id: I74162a9f152d7752a8281da1b89a66da99a3fdc9
Reviewed-on: https://go-review.googlesource.com/c/go/+/399499
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
---
 src/runtime/memmove_ppc64x.s | 56 +++++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 17 deletions(-)
diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
index 25101a28c7..5fa51c0a4c 100644
--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@@ -24,8 +24,12 @@
 #define IDX16 R8
 // temp used for copies, etc.
 #define TMP R9
-// number of 32 byte chunks
+// number of 64 byte chunks
 #define QWORDS R10
+// index values
+#define IDX32 R14
+#define IDX48 R15
+#define OCTWORDS R16
 
 TEXT runtimeÂ·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
 	// R3 = TGT = to
@@ -52,28 +56,46 @@ check:
 	// Copying forward if no overlap.
 
 	BC	12, 6, checkbytes	// BEQ CR1, checkbytes
-	SRDCC	$2, DWORDS, QWORDS	// 32 byte chunks?
-	BEQ	lt32gt8			// < 32 bytes
+	SRDCC	$3, DWORDS, OCTWORDS	// 64 byte chunks?
+	MOVD	$16, IDX16
+	BEQ	lt64gt8			// < 64 bytes
 
-	// Prepare for moves of 32 bytes at a time.
+	// Prepare for moves of 64 bytes at a time.
 
-forward32setup:
+forward64setup:
 	DCBTST	(TGT)			// prepare data cache
 	DCBT	(SRC)
-	MOVD	QWORDS, CTR		// Number of 32 byte chunks
-	MOVD	$16, IDX16		// 16 for index
+	MOVD	OCTWORDS, CTR		// Number of 64 byte chunks
+	MOVD	$32, IDX32
+	MOVD	$48, IDX48
+	PCALIGN	$32
 
-forward32:
-	LXVD2X	(R0)(SRC), VS32		// load 16 bytes
-	LXVD2X	(IDX16)(SRC), VS33	// load 16 bytes
-	ADD	$32, SRC
-	STXVD2X	VS32, (R0)(TGT)		// store 16 bytes
+forward64:
+	LXVD2X	(R0)(SRC), VS32		// load 64 bytes
+	LXVD2X	(IDX16)(SRC), VS33
+	LXVD2X	(IDX32)(SRC), VS34
+	LXVD2X	(IDX48)(SRC), VS35
+	ADD	$64, SRC
+	STXVD2X	VS32, (R0)(TGT)		// store 64 bytes
 	STXVD2X	VS33, (IDX16)(TGT)
-	ADD	$32,TGT			// bump up for next set
-	BC	16, 0, forward32	// continue
-	ANDCC	$3, DWORDS		// remaining doublewords
+	STXVD2X	VS34, (IDX32)(TGT)
+	STXVD2X VS35, (IDX48)(TGT)
+	ADD	$64,TGT			// bump up for next set
+	BC	16, 0, forward64	// continue
+	ANDCC	$7, DWORDS		// remaining doublewords
 	BEQ	checkbytes		// only bytes remain
 
+lt64gt8:
+	CMP	DWORDS, $4
+	BLT	lt32gt8
+	LXVD2X	(R0)(SRC), VS32
+	LXVD2X	(IDX16)(SRC), VS33
+	ADD	$-4, DWORDS
+	STXVD2X	VS32, (R0)(TGT)
+	STXVD2X	VS33, (IDX16)(TGT)
+	ADD	$32, SRC
+	ADD	$32, TGT
+
 lt32gt8:
         // At this point >= 8 and < 32
 	// Move 16 bytes if possible
@@ -134,7 +156,7 @@ backwardtailloop:
 	SUB	$1,SRC
 	MOVBZ 	TMP, -1(TGT)
 	SUB	$1,TGT
-	BC	16, 0, backwardtailloop // bndz
+	BDNZ	backwardtailloop
 
 nobackwardtail:
 	BC	4, 5, LR		// blelr cr1, return if DWORDS == 0
@@ -169,6 +191,6 @@ backward32loop:
 	LXVD2X	(IDX16)(SRC), VS33
 	STXVD2X	VS32, (R0)(TGT)		// store 16x2 bytes
 	STXVD2X	VS33, (IDX16)(TGT)
-	BC      16, 0, backward32loop	// bndz
+	BDNZ	backward32loop
 	BC	12, 2, LR		// beqlr, return if DWORDS == 0
 	BR	backward24
-- 
2.50.0