From 4fde3ef2acef11738c83f6ad9147eba03b5da6d1 Mon Sep 17 00:00:00 2001
From: Lynn Boger
Date: Tue, 23 Jan 2024 12:46:05 -0600
Subject: [PATCH] math/big,crypto/internal/bigmod: unroll loop in addMulVVW
 for ppc64x
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This updates the assembly implementation of addMulVVW to unroll the
main loop to process four words (32 bytes) per iteration. The
addMulVVWx function in crypto/internal/bigmod is based on the same
code and has also been updated to improve performance.

goos: linux
goarch: ppc64le
pkg: crypto/internal/bigmod
cpu: POWER10
                 │ bg.orig.out │               bg.out                │
                 │   sec/op    │   sec/op     vs base                │
ModAdd             116.3n ± 0%   116.9n ± 0%   +0.52% (p=0.002 n=6)
ModSub             111.5n ± 0%   111.5n ± 0%    0.00% (p=0.273 n=6)
MontgomeryRepr     2.195µ ± 0%   1.944µ ± 0%  -11.44% (p=0.002 n=6)
MontgomeryMul      2.195µ ± 0%   1.943µ ± 0%  -11.48% (p=0.002 n=6)
ModMul             4.418µ ± 0%   3.900µ ± 0%  -11.72% (p=0.002 n=6)
ExpBig             5.736m ± 0%   5.117m ± 0%  -10.78% (p=0.002 n=6)
Exp                5.891m ± 0%   5.237m ± 0%  -11.11% (p=0.002 n=6)
geomean            9.901µ        9.094µ        -8.15%

goos: linux
goarch: ppc64le
pkg: math/big
cpu: POWER10
                   │ am.orig.out │               am.out                │
                   │   sec/op    │    sec/op    vs base                │
AddMulVVW/1           4.456n ± 1%    3.565n ± 0%  -20.00% (p=0.002 n=6)
AddMulVVW/2           4.875n ± 1%    5.938n ± 1%  +21.79% (p=0.002 n=6)
AddMulVVW/3           5.484n ± 0%    5.693n ± 0%   +3.80% (p=0.002 n=6)
AddMulVVW/4           6.370n ± 0%    6.065n ± 0%   -4.79% (p=0.002 n=6)
AddMulVVW/5           7.321n ± 0%    7.188n ± 0%   -1.82% (p=0.002 n=6)
AddMulVVW/10          12.26n ± 8%    11.41n ± 0%   -6.97% (p=0.002 n=6)
AddMulVVW/100        100.70n ± 0%    93.58n ± 0%   -7.08% (p=0.002 n=6)
AddMulVVW/1000        938.6n ± 0%    845.5n ± 0%   -9.92% (p=0.002 n=6)
AddMulVVW/10000       9.459µ ± 0%    8.415µ ± 0%  -11.04% (p=0.002 n=6)
AddMulVVW/100000      94.57µ ± 0%    84.01µ ± 0%  -11.16% (p=0.002 n=6)
geomean               75.17n         71.21n        -5.27%

Change-Id: Idd79f5f02387564f4c2cc28d50b1c12bcd9a400f
Reviewed-on: https://go-review.googlesource.com/c/go/+/557915
TryBot-Result: Gopher Robot
Reviewed-by: Paul Murphy
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Cherry Mui
Reviewed-by: Carlos Amedee
Run-TryBot: Lynn Boger
Reviewed-by: Filippo Valsorda
---
 src/crypto/internal/bigmod/nat_ppc64x.s | 87 ++++++++++++++--------
 src/math/big/arith_ppc64x.s             | 95 ++++++++++++++++++-------
 2 files changed, 130 insertions(+), 52 deletions(-)

diff --git a/src/crypto/internal/bigmod/nat_ppc64x.s b/src/crypto/internal/bigmod/nat_ppc64x.s
index 974f4f945e..94260ca29f 100644
--- a/src/crypto/internal/bigmod/nat_ppc64x.s
+++ b/src/crypto/internal/bigmod/nat_ppc64x.s
@@ -8,44 +8,75 @@
 
 // func addMulVVW1024(z, x *uint, y uint) (c uint)
 TEXT ·addMulVVW1024(SB), $0-32
-	MOVD	$16, R22	// R22 = z_len
-	JMP	addMulVVWx(SB)
+	MOVD	$4, R6	// R6 = z_len/4
+	JMP	addMulVVWx<>(SB)
 
 // func addMulVVW1536(z, x *uint, y uint) (c uint)
 TEXT ·addMulVVW1536(SB), $0-32
-	MOVD	$24, R22	// R22 = z_len
-	JMP	addMulVVWx(SB)
+	MOVD	$6, R6	// R6 = z_len/4
+	JMP	addMulVVWx<>(SB)
 
 // func addMulVVW2048(z, x *uint, y uint) (c uint)
 TEXT ·addMulVVW2048(SB), $0-32
-	MOVD	$32, R22	// R22 = z_len
-	JMP	addMulVVWx(SB)
+	MOVD	$8, R6	// R6 = z_len/4
+	JMP	addMulVVWx<>(SB)
 
-TEXT addMulVVWx(SB), NOFRAME|NOSPLIT, $0
-	MOVD	z+0(FP), R10	// R10 = z[]
-	MOVD	x+8(FP), R8	// R8 = x[]
-	MOVD	y+16(FP), R9	// R9 = y
+// This local function expects to be called only by
+// callers above. R6 contains the z length/4
+// since 4 values are processed for each
+// loop iteration, and is guaranteed to be > 0.
+// If other callers are added this function might
+// need to change.
+TEXT addMulVVWx<>(SB), NOSPLIT, $0
+	MOVD	z+0(FP), R3
+	MOVD	x+8(FP), R4
+	MOVD	y+16(FP), R5
 
-	MOVD	R0, R3		// R3 will be the index register
-	CMP	R0, R22
-	MOVD	R0, R4		// R4 = c = 0
-	MOVD	R22, CTR	// Initialize loop counter
-	BEQ	done
-	PCALIGN	$16
+	MOVD	$0, R9		// R9 = c = 0
+	MOVD	R6, CTR		// Initialize loop counter
+	PCALIGN	$16
 loop:
-	MOVD	(R8)(R3), R20	// Load x[i]
-	MOVD	(R10)(R3), R21	// Load z[i]
-	MULLD	R9, R20, R6	// R6 = Low-order(x[i]*y)
-	MULHDU	R9, R20, R7	// R7 = High-order(x[i]*y)
-	ADDC	R21, R6		// R6 = z0
-	ADDZE	R7		// R7 = z1
-	ADDC	R4, R6		// R6 = z0 + c + 0
-	ADDZE	R7, R4		// c += z1
-	MOVD	R6, (R10)(R3)	// Store z[i]
-	ADD	$8, R3
-	BC	16, 0, loop	// bdnz
+	MOVD	0(R4), R14	// x[i]
+	MOVD	8(R4), R16	// x[i+1]
+	MOVD	16(R4), R18	// x[i+2]
+	MOVD	24(R4), R20	// x[i+3]
+	MOVD	0(R3), R15	// z[i]
+	MOVD	8(R3), R17	// z[i+1]
+	MOVD	16(R3), R19	// z[i+2]
+	MOVD	24(R3), R21	// z[i+3]
+	MULLD	R5, R14, R10	// low x[i]*y
+	MULHDU	R5, R14, R11	// high x[i]*y
+	ADDC	R15, R10
+	ADDZE	R11
+	ADDC	R9, R10
+	ADDZE	R11, R9
+	MULLD	R5, R16, R14	// low x[i+1]*y
+	MULHDU	R5, R16, R15	// high x[i+1]*y
+	ADDC	R17, R14
+	ADDZE	R15
+	ADDC	R9, R14
+	ADDZE	R15, R9
+	MULLD	R5, R18, R16	// low x[i+2]*y
+	MULHDU	R5, R18, R17	// high x[i+2]*y
+	ADDC	R19, R16
+	ADDZE	R17
+	ADDC	R9, R16
+	ADDZE	R17, R9
+	MULLD	R5, R20, R18	// low x[i+3]*y
+	MULHDU	R5, R20, R19	// high x[i+3]*y
+	ADDC	R21, R18
+	ADDZE	R19
+	ADDC	R9, R18
+	ADDZE	R19, R9
+	MOVD	R10, 0(R3)	// z[i]
+	MOVD	R14, 8(R3)	// z[i+1]
+	MOVD	R16, 16(R3)	// z[i+2]
+	MOVD	R18, 24(R3)	// z[i+3]
+	ADD	$32, R3
+	ADD	$32, R4
+	BDNZ	loop
 
 done:
-	MOVD	R4, c+24(FP)
+	MOVD	R9, c+24(FP)
 	RET
 
diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s
index 9512a12270..c483e252ab 100644
--- a/src/math/big/arith_ppc64x.s
+++ b/src/math/big/arith_ppc64x.s
@@ -599,33 +599,80 @@ done:
 
 // func addMulVVW(z, x []Word, y Word) (c Word)
 TEXT ·addMulVVW(SB), NOSPLIT, $0
-	MOVD	z+0(FP), R10	// R10 = z[]
-	MOVD	x+24(FP), R8	// R8 = x[]
-	MOVD	y+48(FP), R9	// R9 = y
-	MOVD	z_len+8(FP), R22	// R22 = z_len
-
-	MOVD	R0, R3		// R3 will be the index register
-	CMP	R0, R22
-	MOVD	R0, R4		// R4 = c = 0
-	MOVD	R22, CTR	// Initialize loop counter
-	BEQ	done
-	PCALIGN	$16
+	MOVD	z+0(FP), R3	// R3 = z[]
+	MOVD	x+24(FP), R4	// R4 = x[]
+	MOVD	y+48(FP), R5	// R5 = y
+	MOVD	z_len+8(FP), R6	// R6 = z_len
+
+	CMP	R6, $4
+	MOVD	R0, R9		// R9 = c = 0
+	BLT	tail
+	SRD	$2, R6, R7
+	MOVD	R7, CTR		// Initialize loop counter
+	PCALIGN	$16
 loop:
-	MOVD	(R8)(R3), R20	// Load x[i]
-	MOVD	(R10)(R3), R21	// Load z[i]
-	MULLD	R9, R20, R6	// R6 = Low-order(x[i]*y)
-	MULHDU	R9, R20, R7	// R7 = High-order(x[i]*y)
-	ADDC	R21, R6		// R6 = z0
-	ADDZE	R7		// R7 = z1
-	ADDC	R4, R6		// R6 = z0 + c + 0
-	ADDZE	R7, R4		// c += z1
-	MOVD	R6, (R10)(R3)	// Store z[i]
-	ADD	$8, R3
-	BC	16, 0, loop	// bdnz
+	MOVD	0(R4), R14	// x[i]
+	MOVD	8(R4), R16	// x[i+1]
+	MOVD	16(R4), R18	// x[i+2]
+	MOVD	24(R4), R20	// x[i+3]
+	MOVD	0(R3), R15	// z[i]
+	MOVD	8(R3), R17	// z[i+1]
+	MOVD	16(R3), R19	// z[i+2]
+	MOVD	24(R3), R21	// z[i+3]
+	MULLD	R5, R14, R10	// low x[i]*y
+	MULHDU	R5, R14, R11	// high x[i]*y
+	ADDC	R15, R10
+	ADDZE	R11
+	ADDC	R9, R10
+	ADDZE	R11, R9
+	MULLD	R5, R16, R14	// low x[i+1]*y
+	MULHDU	R5, R16, R15	// high x[i+1]*y
+	ADDC	R17, R14
+	ADDZE	R15
+	ADDC	R9, R14
+	ADDZE	R15, R9
+	MULLD	R5, R18, R16	// low x[i+2]*y
+	MULHDU	R5, R18, R17	// high x[i+2]*y
+	ADDC	R19, R16
+	ADDZE	R17
+	ADDC	R9, R16
+	ADDZE	R17, R9
+	MULLD	R5, R20, R18	// low x[i+3]*y
+	MULHDU	R5, R20, R19	// high x[i+3]*y
+	ADDC	R21, R18
+	ADDZE	R19
+	ADDC	R9, R18
+	ADDZE	R19, R9
+	MOVD	R10, 0(R3)	// z[i]
+	MOVD	R14, 8(R3)	// z[i+1]
+	MOVD	R16, 16(R3)	// z[i+2]
+	MOVD	R18, 24(R3)	// z[i+3]
+	ADD	$32, R3
+	ADD	$32, R4
+	BDNZ	loop
+
+	ANDCC	$3, R6
+tail:
+	CMP	R0, R6
+	BEQ	done
+	MOVD	R6, CTR
+	PCALIGN	$16
+tailloop:
+	MOVD	0(R4), R14
+	MOVD	0(R3), R15
+	MULLD	R5, R14, R10
+	MULHDU	R5, R14, R11
+	ADDC	R15, R10
+	ADDZE	R11
+	ADDC	R9, R10
+	ADDZE	R11, R9
+	MOVD	R10, 0(R3)
+	ADD	$8, R3
+	ADD	$8, R4
+	BDNZ	tailloop
 
 done:
-	MOVD	R4, c+56(FP)
+	MOVD	R9, c+56(FP)
 	RET
-
-- 
2.48.1
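
For readers who don't speak ppc64 assembly, the following is a minimal
pure-Go sketch of what the patched routine computes. It is not part of
the patch: the helper addMulStep is invented for illustration, and it
assumes 64-bit words (math/big's Word is uintptr-sized). It mirrors the
structure of the new assembly: each word is handled by one MULLD/MULHDU
pair plus two ADDC/ADDZE carry chains, the main loop is unrolled four
words per iteration, and a one-word tail loop covers the remaining
len(z)%4 words.

package main

import (
	"fmt"
	"math/bits"
)

// addMulStep models one MULLD/MULHDU + ADDC/ADDZE group from the
// assembly: it returns the low word of z + x*y + c and the carry
// that propagates into the next word. The full sum always fits in
// 128 bits, so the carry cannot overflow.
func addMulStep(z, x, y, c uint64) (uint64, uint64) {
	hi, lo := bits.Mul64(x, y)     // MULLD (lo) / MULHDU (hi)
	lo, cc := bits.Add64(lo, z, 0) // ADDC: add z[i] into the low word
	hi += cc                       // ADDZE: fold the carry into hi
	lo, cc = bits.Add64(lo, c, 0)  // ADDC: add the running carry
	return lo, hi + cc             // ADDZE: new running carry
}

// addMulVVW computes z += x*y and returns the final carry word.
// The first loop is unrolled four words per iteration, like the new
// main loop; the second matches the one-word tail loop.
func addMulVVW(z, x []uint64, y uint64) (c uint64) {
	i := 0
	for ; i+4 <= len(z); i += 4 {
		z[i], c = addMulStep(z[i], x[i], y, c)
		z[i+1], c = addMulStep(z[i+1], x[i+1], y, c)
		z[i+2], c = addMulStep(z[i+2], x[i+2], y, c)
		z[i+3], c = addMulStep(z[i+3], x[i+3], y, c)
	}
	for ; i < len(z); i++ { // tail: len(z)%4 leftover words
		z[i], c = addMulStep(z[i], x[i], y, c)
	}
	return c
}

func main() {
	z := []uint64{1, 2, 3, 4, 5}
	x := []uint64{^uint64(0), 2, 3, 4, 5}
	c := addMulVVW(z, x, 7) // z += x*7, c is the overflow word
	fmt.Println(z, c)
}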
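One plausible reading of the numbers above: unrolling does not shorten
the serial carry chain (each word's ADDC/ADDZE still depends on the
previous carry), but it cuts the loop overhead to one BDNZ per four
words, lets the eight loads issue together, and gives the core four
independent multiplies to overlap, consistent with the roughly 11%
gains at the larger sizes. Lengths below four words now branch straight
to the tail path, which likely accounts for the AddMulVVW/2 regression.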