From e6faa375b4804588cdeb67eff9d01add8093f6e1 Mon Sep 17 00:00:00 2001
From: Archana R <aravind5@in.ibm.com>
Date: Mon, 13 Feb 2023 06:18:59 -0600
Subject: [PATCH] runtime: improve memclr on ppc64x/power10
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Rewrite memclr asm function to use the new power10 instruction stxvl
or the store vector with length which can specify the number of bytes
to be stored in a register, thereby avoiding loops to store the tail
end bytes.
On power9 and power8 the code remains unchanged.
The performance for all sizes<16 improve on power10 with this change.

name          old time/op    new time/op     delta
Memclr/1        2.59ns Â± 1%     2.80ns Â± 9%      ~
Memclr/2        2.77ns Â± 0%     2.90ns Â± 8%      ~
Memclr/3        3.70ns Â± 1%     3.00ns Â± 1%   -19.02%
Memclr/4        4.56ns Â± 5%     2.98ns Â± 1%   -34.56%
Memclr/5        10.1ns Â± 8%      2.9ns Â± 4%   -70.82%
Memclr/6        6.99ns Â± 4%     2.84ns Â± 9%   -59.40%
Memclr/7        8.66ns Â± 5%     2.92ns Â± 4%   -66.27%
Memclr/8        2.75ns Â± 1%     2.75ns Â± 0%      ~
Memclr/9        3.43ns Â± 1%     2.75ns Â± 0%   -19.78%
Memclr/10       3.30ns Â± 0%     2.77ns Â± 0%   -15.81%
Memclr/12       5.09ns Â± 0%     2.78ns Â± 0%   -45.49%
Memclr/15       9.79ns Â± 0%     2.78ns Â± 0%   -71.64%
Memclr/16       2.65ns Â± 1%     2.77ns Â± 0%    +4.77%
GoMemclr/1      2.46ns Â± 3%     2.46ns Â± 3%      ~
GoMemclr/2      2.91ns Â± 0%     2.46ns Â± 1%   -15.38%
GoMemclr/3      4.85ns Â± 6%     2.45ns Â± 1%   -49.50%
GoMemclr/4      5.44ns Â± 5%     2.47ns Â± 1%   -54.63%
GoMemclr/5      6.48ns Â± 0%     2.45ns Â± 1%   -62.20%
GoMemclr/6      9.33ns Â±20%     2.45ns Â± 1%   -73.76%
GoMemclr/7      8.79ns Â± 2%     2.46ns Â± 1%   -72.03%
GoMemclr/8      2.91ns Â± 0%     2.84ns Â± 0%    -2.35%
GoMemclr/9      5.04ns Â± 0%     2.84ns Â± 0%   -43.67%
GoMemclr/10     5.17ns Â± 1%     2.84ns Â± 0%   -44.99%
GoMemclr/12     6.22ns Â± 0%     2.84ns Â± 0%   -54.41%
GoMemclr/15     10.1ns Â± 0%      2.8ns Â± 0%   -72.04%
GoMemclr/16     2.90ns Â± 0%     2.92ns Â± 0%    +0.83%

Change-Id: I2b7e1a9ceafba11c57c04a2e8cbd12b5ac87ec9b
Reviewed-on: https://go-review.googlesource.com/c/go/+/467635
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Than McIntosh <thanm@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Archana Ravindar <aravind5@in.ibm.com>
---
 src/runtime/memclr_ppc64x.s | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/runtime/memclr_ppc64x.s b/src/runtime/memclr_ppc64x.s
index 354325585d..3e569282d0 100644
--- a/src/runtime/memclr_ppc64x.s
+++ b/src/runtime/memclr_ppc64x.s
@@ -91,24 +91,33 @@ lt32gt8:
 	ADD	$16, R3
 	ADD	$-16, R4
 lt16gt8:
+#ifdef GOPPC64_power10
+	SLD	$56, R4, R7
+	STXVL   V0, R3, R7
+	RET
+#else
 	CMP	R4, $8
 	BLT	nozerolarge
 	MOVD	R0, 0(R3)
 	ADD	$8, R3
 	ADD	$-8, R4
-
+#endif
 nozerolarge:
 	ANDCC $7, R4, R5 // any remaining bytes
 	BC    4, 1, LR   // ble lr
-
-zerotail:
+#ifdef GOPPC64_power10
+	XXLXOR  VS32, VS32, VS32 // clear VS32 (V0)
+	SLD	$56, R5, R7
+	STXVL   V0, R3, R7
+	RET
+#else
 	MOVD R5, CTR // set up to clear tail bytes
-
 zerotailloop:
 	MOVB R0, 0(R3)           // clear single bytes
 	ADD  $1, R3
 	BDNZ zerotailloop // dec ctr, br zerotailloop if ctr not 0
 	RET
+#endif
 
 zero512xsetup:  // 512 chunk with extra needed
 	ANDCC $8, R3, R11    // 8 byte alignment?
@@ -123,8 +132,6 @@ zero512setup16:
 	MOVD  $128, R15
 	SUB   R14, R15, R14 // find increment to 128 alignment
 	SRD   $4, R14, R15  // number of 16 byte chunks
-
-zero512presetup:
 	MOVD   R15, CTR         // loop counter of 16 bytes
 	XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
 
@@ -142,8 +149,7 @@ zero512setup:  // setup for dcbz loop
 	MOVD $128, R9   // index regs for 128 bytes
 	MOVD $256, R10
 	MOVD $384, R11
-	PCALIGN	$32
-
+	PCALIGN $32
 zero512:
 	DCBZ (R3+R0)        // clear first chunk
 	DCBZ (R3+R9)        // clear second chunk
-- 
2.50.0