From fd3d27938afa1a5df94f1e056deeb5842855fbdc Mon Sep 17 00:00:00 2001
From: Vlad Krasnov
Date: Mon, 6 Nov 2017 13:59:51 -0800
Subject: [PATCH] math/big: implement addMulVVW on arm64
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

The lack of proper addMulVVW implementation for arm64 hurts RSA performance.

This assembly implementation is optimized for arm64 based servers.

name old time/op new time/op delta
pkg:math/big goos:linux goarch:arm64
AddMulVVW/1 55.2ns ± 0% 11.9ns ± 1% -78.37% (p=0.000 n=8+10)
AddMulVVW/2 67.0ns ± 0% 11.2ns ± 0% -83.28% (p=0.000 n=7+10)
AddMulVVW/3 93.2ns ± 0% 13.2ns ± 0% -85.84% (p=0.000 n=10+10)
AddMulVVW/4 126ns ± 0% 13ns ± 1% -89.82% (p=0.000 n=10+10)
AddMulVVW/5 151ns ± 0% 17ns ± 0% -88.87% (p=0.000 n=10+9)
AddMulVVW/10 323ns ± 0% 25ns ± 0% -92.20% (p=0.000 n=10+10)
AddMulVVW/100 3.28µs ± 0% 0.14µs ± 0% -95.82% (p=0.000 n=10+10)
AddMulVVW/1000 31.7µs ± 0% 1.3µs ± 0% -96.00% (p=0.000 n=10+8)
AddMulVVW/10000 313µs ± 0% 13µs ± 0% -95.98% (p=0.000 n=10+10)
AddMulVVW/100000 3.24ms ± 0% 0.13ms ± 1% -96.13% (p=0.000 n=9+9)
pkg:crypto/rsa goos:linux goarch:arm64
RSA2048Decrypt 44.7ms ± 0% 4.0ms ± 6% -91.08% (p=0.000 n=8+10)
RSA2048Sign 46.3ms ± 0% 5.0ms ± 0% -89.29% (p=0.000 n=9+10)
3PrimeRSA2048Decrypt 22.3ms ± 0% 2.4ms ± 0% -89.26% (p=0.000 n=10+10)

Change-Id: I295f0bd5c51a4442d02c44ece1f6026d30dff0bc
Reviewed-on: https://go-review.googlesource.com/76270
Reviewed-by: Vlad Krasnov
Reviewed-by: Cherry Zhang
Run-TryBot: Vlad Krasnov
TryBot-Result: Gobot Gobot
---
 src/math/big/arith_arm64.s | 83 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 82 insertions(+), 1 deletion(-)

diff --git a/src/math/big/arith_arm64.s b/src/math/big/arith_arm64.s
index 8b4b7136fa..0974c97c57 100644
--- a/src/math/big/arith_arm64.s
+++ b/src/math/big/arith_arm64.s
@@ -199,8 +199,89 @@ done:
 
 // func addMulVVW(z, x []Word, y Word) (c Word)
 TEXT ·addMulVVW(SB),NOSPLIT,$0
-	B ·addMulVVW_g(SB)
+	MOVD	z+0(FP), R1	// R1 = pointer to the current word of z (advanced as words are stored)
+	MOVD	z_len+8(FP), R0	// R0 = len(z) = words left to process; x's length is never read, so x must supply at least len(z) words
+	MOVD	x+24(FP), R2	// R2 = pointer to the current word of x (advanced as words are loaded)
+	MOVD	y+48(FP), R3	// R3 = the single-word multiplier y
+	MOVD	$0, R4	// R4 = carry word, initially 0; overall the routine does z[i] += x[i]*y for every word and returns the last carry
+
+	TBZ	$0, R0, two	// bit 0 of the count is clear: no single odd word to peel off
+
+	MOVD.P	8(R2), R5	// R5 = x word; post-increment R2 by one 8-byte word
+	MOVD	(R1), R6	// R6 = matching z word
+
+	MUL	R5, R3, R7	// R7 = low 64 bits of x*y
+	UMULH	R5, R3, R8	// R8 = high 64 bits of x*y
+
+	ADDS	R7, R6	// R6 += R7, setting the carry flag
+	ADC	$0, R8, R4	// R4 = R8 + carry flag: carry word out of this step
+
+	MOVD.P	R6, 8(R1)	// store the updated z word; post-increment R1 by one word
+	SUB	$1, R0	// one word consumed; bit 0 of the count is now clear
+
+two:
+	TBZ	$1, R0, loop	// bit 1 of the count is clear: no pair of words to peel off
+
+	LDP.P	16(R2), (R5, R10)	// R5 = x0, R10 = x1; post-increment R2 by two words
+	LDP	(R1), (R6, R11)	// R6 = z0, R11 = z1
+
+	MUL	R10, R3, R13	// R13 = lo(x1*y)
+	UMULH	R10, R3, R12	// R12 = hi(x1*y)
+
+	MUL	R5, R3, R7	// R7 = lo(x0*y)
+	UMULH	R5, R3, R8	// R8 = hi(x0*y)
 
+	ADDS	R4, R6	// first carry chain: z0 += incoming carry word
+	ADCS	R13, R11	// z1 += lo(x1*y) + carry flag
+	ADC	$0, R12	// R12 += carry flag (end of first chain)
+
+	ADDS	R7, R6	// second carry chain: z0 += lo(x0*y)
+	ADCS	R8, R11	// z1 += hi(x0*y) + carry flag
+	ADC	$0, R12, R4	// R4 = R12 + carry flag: carry word out of this step
+
+	STP.P	(R6, R11), 16(R1)	// store z0, z1; post-increment R1 by two words
+	SUB	$2, R0	// two words consumed; the remaining count is now a multiple of 4
+
+// The main loop of this code operates on a block of 4 words every iteration
+// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
+// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
+// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
+loop:
+	CBZ	R0, done	// no words left
+
+	LDP.P	16(R2), (R5, R6)	// R5 = x0, R6 = x1
+	LDP.P	16(R2), (R7, R8)	// R7 = x2, R8 = x3; R2 now points past this block of x
+
+	LDP	(R1), (R9, R10)	// R9 = z0, R10 = z1
+	ADDS	R4, R9	// first carry chain: z0 += carry word from the previous step
+	MUL	R6, R3, R14	// R14 = lo(x1*y); MUL/UMULH/LDP/STP leave the flags untouched, so they may sit inside a carry chain
+	ADCS	R14, R10	// z1 += lo(x1*y) + carry flag
+	MUL	R7, R3, R15	// R15 = lo(x2*y)
+	LDP	16(R1), (R11, R12)	// R11 = z2, R12 = z3
+	ADCS	R15, R11	// z2 += lo(x2*y) + carry flag
+	MUL	R8, R3, R16	// R16 = lo(x3*y)
+	ADCS	R16, R12	// z3 += lo(x3*y) + carry flag
+	UMULH	R8, R3, R20	// R20 = hi(x3*y)
+	ADC	$0, R20	// R20 += carry flag (end of first chain)
+
+	MUL	R5, R3, R13	// R13 = lo(x0*y)
+	ADDS	R13, R9	// second carry chain: z0 += lo(x0*y)
+	UMULH	R5, R3, R17	// R17 = hi(x0*y)
+	ADCS	R17, R10	// z1 += hi(x0*y) + carry flag
+	UMULH	R6, R3, R21	// R21 = hi(x1*y)
+	STP.P	(R9, R10), 16(R1)	// z0, z1 are final: store them; post-increment R1 by two words
+	ADCS	R21, R11	// z2 += hi(x1*y) + carry flag
+	UMULH	R7, R3, R19	// R19 = hi(x2*y)
+	ADCS	R19, R12	// z3 += hi(x2*y) + carry flag
+	STP.P	(R11, R12), 16(R1)	// z2, z3 are final: store them; post-increment R1 by two words
+	ADC	$0, R20, R4	// R4 = R20 + carry flag: carry word into the next iteration
+
+	SUB	$4, R0	// four words consumed
+	B	loop
+
+done:
+	MOVD	R4, c+56(FP)	// return the final carry word as c
+	RET
 
 // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
 TEXT ·divWVW(SB),NOSPLIT,$0
-- 
2.50.0