From 932330fdbf669e28748227148f3f633620a5a300 Mon Sep 17 00:00:00 2001
From: Lynn Boger <laboger@linux.vnet.ibm.com>
Date: Tue, 1 Nov 2022 10:27:06 -0500
Subject: [PATCH] math/big: add PCALIGN to addMulVVW asm on ppc64x
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Adding PCALIGN to addMulVVW assembler implementation
provides the following improvement on power10:

    AddMulVVW/1         3.36ns Â± 0%    3.37ns Â± 0%   +0.20%
    AddMulVVW/2         4.45ns Â± 0%    4.44ns Â± 0%   -0.25%
    AddMulVVW/3         5.44ns Â± 0%    5.49ns Â± 0%   +0.84%
    AddMulVVW/4         6.43ns Â± 0%    6.34ns Â± 0%   -1.33%
    AddMulVVW/5         7.87ns Â± 0%    7.73ns Â± 0%   -1.70%
    AddMulVVW/10        13.4ns Â± 3%    12.4ns Â± 7%   -7.07%
    AddMulVVW/100        112ns Â± 0%     102ns Â± 0%   -9.34%
    AddMulVVW/1000      1.09Âµs Â± 0%    0.95Âµs Â± 0%  -13.15%
    AddMulVVW/10000     10.9Âµs Â± 0%     9.6Âµs Â± 0%  -12.46%
    AddMulVVW/100000     109Âµs Â± 0%      95Âµs Â± 0%  -12.58%

Change-Id: Ic33d4f125c84d568f63e17cf99dc4df5ca9328d9
Reviewed-on: https://go-review.googlesource.com/c/go/+/447236
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Bryan Mills <bcmills@google.com>
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Paul Murphy <murp@ibm.com>
Reviewed-by: Archana Ravindar <ravindararchana@gmail.com>
---
 src/math/big/arith_ppc64x.s | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s
index a83696a0cb..5fdbf40a24 100644
--- a/src/math/big/arith_ppc64x.s
+++ b/src/math/big/arith_ppc64x.s
@@ -44,6 +44,8 @@ TEXT Â·addVV(SB), NOSPLIT, $0
 	// for small values of z_len (0.90x in the worst case), but
 	// gain significant performance as z_len increases (up to
 	// 1.45x).
+
+	PCALIGN $32
 loop:
 	MOVD  8(R8), R11      // R11 = x[i]
 	MOVD  16(R8), R12     // R12 = x[i+1]
@@ -131,6 +133,8 @@ TEXT Â·subVV(SB), NOSPLIT, $0
 	// for small values of z_len (0.92x in the worst case), but
 	// gain significant performance as z_len increases (up to
 	// 1.45x).
+
+	PCALIGN $32
 loop:
 	MOVD  8(R8), R11      // R11 = x[i]
 	MOVD  16(R8), R12     // R12 = x[i+1]
@@ -212,6 +216,7 @@ TEXT Â·addVW(SB), NOSPLIT, $0
 	CMP   R0, R9
 	MOVD  R9, CTR		// Set up the loop counter
 	BEQ   tail		// If R9 = 0, we can't use the loop
+	PCALIGN $32
 
 loop:
 	MOVD  8(R8), R20	// R20 = x[i]
@@ -288,6 +293,8 @@ TEXT Â·subVW(SB), NOSPLIT, $0
 	// The loop here is almost the same as the one used in s390x, but
 	// we don't need to capture CA every iteration because we've already
 	// done that above.
+
+	PCALIGN $32
 loop:
 	MOVD  8(R8), R20
 	MOVD  16(R8), R21
@@ -358,6 +365,7 @@ TEXT Â·shlVU(SB), NOSPLIT, $0
 	CMP     R5, R0          // iterate from i=len(z)-1 to 0
 	BEQ     loopexit        // Already at end?
 	MOVD	0(R15),R10	// x[i]
+	PCALIGN $32
 shloop:
 	SLD     R9, R10, R10    // x[i]<<s
 	MOVDU   -8(R15), R14
@@ -520,6 +528,7 @@ TEXT Â·mulAddVWW(SB), NOSPLIT, $0
 	CMP     R0, R14
 	MOVD    R14, CTR          // Set up the loop counter
 	BEQ     tail              // If R9 = 0, we can't use the loop
+	PCALIGN $32
 
 loop:
 	MOVD    8(R8), R20        // R20 = x[i]
@@ -602,6 +611,7 @@ TEXT Â·addMulVVW(SB), NOSPLIT, $0
 	MOVD R0, R4		// R4 = c = 0
 	MOVD R22, CTR		// Initialize loop counter
 	BEQ  done
+	PCALIGN $32
 
 loop:
 	MOVD  (R8)(R3), R20	// Load x[i]
-- 
2.52.0