From 6e4a0d8e44c845251c01fee3923113e6ba8d1e06 Mon Sep 17 00:00:00 2001 From: kmvijay Date: Thu, 30 Oct 2025 14:50:14 +0000 Subject: [PATCH] crypto/internal/fips140/bigmod: vector implementation of addMulVVWx on s390x MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The addMulVVWx assembly routine is used to multiply a bignum multiplicand by a 64-bit multiplier. The new implementation for s390x architecture uses an algorithm based on vector instructions, with a significant performance improvement. Note: z13 is the minimum architecture for Go, which already has VX support. The performance improvement is shown below: goos: linux goarch: s390x pkg: crypto/internal/fips140/bigmod Orig.txt Vector_Patch.txt sec/op sec/op vs base ModAdd 164.1n ± 0% 159.7n ± 0% -2.7% (p=0.000 n=10) ModSub 152.3n ± 1% 147.3n ± 0% -3.25% (p=0.000 n=10) MontgomeryRepr 4.806µ ± 3% 1.829µ ± 0% -61.94% (p=0.000 n=10) MontgomeryMul 4.812µ ± 5% 1.834µ ± 0% -61.90% (p=0.000 n=10) ModMul 9.646µ ± 3% 3.698µ ± 0% -61.67% (p=0.000 n=10) ExpBig 11.28m ± 0% 11.28m ± 0% +0.04% (p=0.035 n=10) Exp 12.284m ± 5% 5.004m ± 1% -59.26% (p=0.000 n=10) geomean 18.61µ 10.74µ -42.2% Change-Id: I679944c9dac9f43f1626b018f72efa6da0d2442d Cq-Include-Trybots: luci.golang.try:gotip-linux-s390x Reviewed-on: https://go-review.googlesource.com/c/go/+/716480 Auto-Submit: Filippo Valsorda Reviewed-by: Vishwanatha HD Reviewed-by: Cherry Mui Reviewed-by: Roland Shoemaker Reviewed-by: Filippo Valsorda Reviewed-by: Srinivas Pokala LUCI-TryBot-Result: Go LUCI --- .../internal/fips140/bigmod/nat_s390x.s | 205 ++++++++++++------ 1 file changed, 140 insertions(+), 65 deletions(-) diff --git a/src/crypto/internal/fips140/bigmod/nat_s390x.s b/src/crypto/internal/fips140/bigmod/nat_s390x.s index 0c07a0c8a6..9adeb9981d 100644 --- a/src/crypto/internal/fips140/bigmod/nat_s390x.s +++ b/src/crypto/internal/fips140/bigmod/nat_s390x.s @@ -4,82 +4,157 @@ //go:build !purego +// Register usage (z13 convention): +// R2 = 
rp (result pointer) +// R3 = ap (source pointer) +// R4 = an / idx (loop counter) +// R5 = b0 (multiplier limb) +// R6 = cy (carry) + #include "textflag.h" // func addMulVVW1024(z, x *uint, y uint) (c uint) TEXT ·addMulVVW1024(SB), $0-32 - MOVD $16, R5 - JMP addMulVVWx(SB) + MOVD $16, R4 + JMP addMulVVWx(SB) // func addMulVVW1536(z, x *uint, y uint) (c uint) TEXT ·addMulVVW1536(SB), $0-32 - MOVD $24, R5 - JMP addMulVVWx(SB) + MOVD $24, R4 + JMP addMulVVWx(SB) // func addMulVVW2048(z, x *uint, y uint) (c uint) TEXT ·addMulVVW2048(SB), $0-32 - MOVD $32, R5 - JMP addMulVVWx(SB) + MOVD $32, R4 + JMP addMulVVWx(SB) TEXT addMulVVWx(SB), NOFRAME|NOSPLIT, $0 MOVD z+0(FP), R2 - MOVD x+8(FP), R8 - MOVD y+16(FP), R9 - - MOVD $0, R1 // i*8 = 0 - MOVD $0, R7 // i = 0 - MOVD $0, R0 // make sure it's zero - MOVD $0, R4 // c = 0 - - MOVD R5, R12 - AND $-2, R12 - CMPBGE R5, $2, A6 - BR E6 - -A6: - MOVD (R8)(R1*1), R6 - MULHDU R9, R6 - MOVD (R2)(R1*1), R10 - ADDC R10, R11 // add to low order bits - ADDE R0, R6 - ADDC R4, R11 - ADDE R0, R6 - MOVD R6, R4 - MOVD R11, (R2)(R1*1) - - MOVD (8)(R8)(R1*1), R6 - MULHDU R9, R6 - MOVD (8)(R2)(R1*1), R10 - ADDC R10, R11 // add to low order bits - ADDE R0, R6 - ADDC R4, R11 - ADDE R0, R6 - MOVD R6, R4 - MOVD R11, (8)(R2)(R1*1) - - ADD $16, R1 // i*8 + 8 - ADD $2, R7 // i++ - - CMPBLT R7, R12, A6 - BR E6 - -L6: - // TODO: drop unused single-step loop. 
- MOVD (R8)(R1*1), R6 - MULHDU R9, R6 - MOVD (R2)(R1*1), R10 - ADDC R10, R11 // add to low order bits - ADDE R0, R6 - ADDC R4, R11 - ADDE R0, R6 - MOVD R6, R4 - MOVD R11, (R2)(R1*1) - - ADD $8, R1 // i*8 + 8 - ADD $1, R7 // i++ - -E6: - CMPBLT R7, R5, L6 // i < n - - MOVD R4, c+24(FP) + MOVD x+8(FP), R3 + MOVD y+16(FP), R5 + + MOVD $0, R6 + +L_ent: + VZERO V0 + VZERO V2 + SRD $2, R4, R10 + TMLL R4, $1 + BRC $8, L_bx0 + +L_bx1: + VLEG $1, 0(R2), V2 + VZERO V4 + TMLL R4, $2 + BRC $7, L_b11 + +L_b01: + MOVD $-24, R4 + MOVD R6, R0 + MOVD 0(R3), R7 + MLGR R5, R6 + ADDC R0, R7 + MOVD $0, R0 + ADDE R0, R6 + VLVGG $1, R7, V4 + VAQ V2, V4, V2 + VSTEG $1, V2, 0(R2) + VMRHG V2, V2, V2 + CMPBEQ R10, $0, L_1 + BR L_cj0 + +L_b11: + MOVD $-8, R4 + MOVD 0(R3), R9 + MLGR R5, R8 + ADDC R6, R9 + MOVD $0, R6 + ADDE R6, R8 + VLVGG $1, R9, V4 + VAQ V2, V4, V2 + VSTEG $1, V2, 0(R2) + VMRHG V2, V2, V2 + BR L_cj1 + +L_bx0: + TMLL R4, $2 + BRC $7, L_b10 + +L_b00: + MOVD $-32, R4 + +L_cj0: + MOVD 32(R3)(R4), R1 + MOVD 40(R3)(R4), R9 + MLGR R5, R0 + MLGR R5, R8 + VL 32(R4)(R2), V1 + VPDI $4, V1, V1, V1 + VLVGP R0, R1, V6 + VLVGP R9, R6, V7 + BR L_mid + +L_b10: + MOVD $-16, R4 + MOVD R6, R8 + +L_cj1: + MOVD 16(R4)(R3), R1 + MOVD 24(R4)(R3), R7 + MLGR R5, R0 + MLGR R5, R6 + VL 16(R4)(R2), V1 + VPDI $4, V1, V1, V1 + VLVGP R0, R1, V6 + VLVGP R7, R8, V7 + CMPBEQ R10, $0, L_end + +L_top: + MOVD 32(R4)(R3), R1 + MOVD 40(R4)(R3), R9 + MLGR R5, R0 + MLGR R5, R8 + VACQ V6, V1, V0, V5 + VACCCQ V6, V1, V0, V0 + VACQ V5, V7, V2, V3 + VACCCQ V5, V7, V2, V2 + VPDI $4, V3, V3, V3 + VL 32(R4)(R2), V1 + VPDI $4, V1, V1, V1 + VST V3, 16(R4)(R2) + VLVGP R0, R1, V6 + VLVGP R9, R6, V7 + +L_mid: + MOVD 48(R4)(R3), R1 + MOVD 56(R4)(R3), R7 + MLGR R5, R0 + MLGR R5, R6 + VACQ V6, V1, V0, V5 + VACCCQ V6, V1, V0, V0 + VACQ V5, V7, V2, V3 + VACCCQ V5, V7, V2, V2 + VPDI $4, V3, V3, V3 + VL 48(R4)(R2), V1 + VPDI $4, V1, V1, V1 + VST V3, 32(R4)(R2) + VLVGP R0, R1, V6 + VLVGP R7, R8, V7 + MOVD $32(R4), R4 + BRCTG R10, L_top 
+ +L_end: + VACQ V6, V1, V0, V5 + VACCCQ V6, V1, V0, V0 + VACQ V5, V7, V2, V3 + VACCCQ V5, V7, V2, V2 + VPDI $4, V3, V3, V3 + VST V3, 16(R2)(R4) + VAG V0, V2, V2 + +L_1: + VLGVG $1, V2, R2 + ADDC R6, R2 + MOVD R2, c+24(FP) RET + -- 2.52.0