From 932330fdbf669e28748227148f3f633620a5a300 Mon Sep 17 00:00:00 2001 From: Lynn Boger Date: Tue, 1 Nov 2022 10:27:06 -0500 Subject: [PATCH] math/big: add PCALIGN to addMulVVW asm on ppc64x MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Adding PCALIGN to addMulVVW assembler implementation provides the following improvement on power10: AddMulVVW/1 3.36ns ± 0% 3.37ns ± 0% +0.20% AddMulVVW/2 4.45ns ± 0% 4.44ns ± 0% -0.25% AddMulVVW/3 5.44ns ± 0% 5.49ns ± 0% +0.84% AddMulVVW/4 6.43ns ± 0% 6.34ns ± 0% -1.33% AddMulVVW/5 7.87ns ± 0% 7.73ns ± 0% -1.70% AddMulVVW/10 13.4ns ± 3% 12.4ns ± 7% -7.07% AddMulVVW/100 112ns ± 0% 102ns ± 0% -9.34% AddMulVVW/1000 1.09µs ± 0% 0.95µs ± 0% -13.15% AddMulVVW/10000 10.9µs ± 0% 9.6µs ± 0% -12.46% AddMulVVW/100000 109µs ± 0% 95µs ± 0% -12.58% Change-Id: Ic33d4f125c84d568f63e17cf99dc4df5ca9328d9 Reviewed-on: https://go-review.googlesource.com/c/go/+/447236 TryBot-Result: Gopher Robot Reviewed-by: Bryan Mills Reviewed-by: Russ Cox Run-TryBot: Lynn Boger Reviewed-by: Paul Murphy Reviewed-by: Archana Ravindar --- src/math/big/arith_ppc64x.s | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s index a83696a0cb..5fdbf40a24 100644 --- a/src/math/big/arith_ppc64x.s +++ b/src/math/big/arith_ppc64x.s @@ -44,6 +44,8 @@ TEXT ·addVV(SB), NOSPLIT, $0 // for small values of z_len (0.90x in the worst case), but // gain significant performance as z_len increases (up to // 1.45x). + + PCALIGN $32 loop: MOVD 8(R8), R11 // R11 = x[i] MOVD 16(R8), R12 // R12 = x[i+1] @@ -131,6 +133,8 @@ TEXT ·subVV(SB), NOSPLIT, $0 // for small values of z_len (0.92x in the worst case), but // gain significant performance as z_len increases (up to // 1.45x). + + PCALIGN $32 loop: MOVD 8(R8), R11 // R11 = x[i] MOVD 16(R8), R12 // R12 = x[i+1] @@ -212,6 +216,7 @@ TEXT ·addVW(SB), NOSPLIT, $0 CMP R0, R9 MOVD R9, CTR // Set up the loop counter BEQ tail // If R9 = 0, we can't use the loop + PCALIGN $32 loop: MOVD 8(R8), R20 // R20 = x[i] @@ -288,6 +293,8 @@ TEXT ·subVW(SB), NOSPLIT, $0 // The loop here is almost the same as the one used in s390x, but // we don't need to capture CA every iteration because we've already // done that above. + + PCALIGN $32 loop: MOVD 8(R8), R20 MOVD 16(R8), R21 @@ -358,6 +365,7 @@ TEXT ·shlVU(SB), NOSPLIT, $0 CMP R5, R0 // iterate from i=len(z)-1 to 0 BEQ loopexit // Already at end? MOVD 0(R15),R10 // x[i] + PCALIGN $32 shloop: SLD R9, R10, R10 // x[i]<