Benchmark results on Loongson 3A5000 (which is an LA464 implementation):
goos: linux
goarch: loong64
pkg: math/big
cpu: Loongson-3A5000-HV @ 2500.00MHz
│ test/old_3a5000_muladdvww.log │ test/new_3a5000_muladdvww.log │
│ sec/op │ sec/op vs base │
MulAddVWW/1 7.606n ± 0% 6.987n ± 0% -8.14% (p=0.000 n=20)
MulAddVWW/2 9.207n ± 0% 8.567n ± 0% -6.95% (p=0.000 n=20)
MulAddVWW/3 10.810n ± 0% 9.223n ± 0% -14.68% (p=0.000 n=20)
MulAddVWW/4 13.01n ± 0% 12.41n ± 0% -4.61% (p=0.000 n=20)
MulAddVWW/5 15.79n ± 0% 12.99n ± 0% -17.73% (p=0.000 n=20)
MulAddVWW/10 25.62n ± 0% 20.02n ± 0% -21.86% (p=0.000 n=20)
MulAddVWW/100 217.0n ± 0% 170.9n ± 0% -21.24% (p=0.000 n=20)
MulAddVWW/1000 2.064µ ± 0% 1.612µ ± 0% -21.90% (p=0.000 n=20)
MulAddVWW/10000 24.50µ ± 0% 16.74µ ± 0% -31.66% (p=0.000 n=20)
MulAddVWW/100000 239.1µ ± 0% 171.1µ ± 0% -28.45% (p=0.000 n=20)
geomean 159.2n 130.3n -18.18%
Change-Id: I063434bc382f4f1234f879172ab671a3d6f2eb80
Reviewed-on: https://go-review.googlesource.com/c/go/+/659881
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Carlos Amedee <carlos@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
TEXT ·rshVU(SB),NOSPLIT,$0
JMP ·rshVU_g(SB)
+// func mulAddVWW(z, x []Word, y, r Word) (c Word)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
- JMP ·mulAddVWW_g(SB)
+ // input:
+ // R4: z
+ // R5: z_len
+ // R7: x
+ // R10: y
+ // R11: r
+ MOVV z+0(FP), R4
+ MOVV z_len+8(FP), R5
+ MOVV x+24(FP), R7
+ MOVV y+48(FP), R10
+ MOVV r+56(FP), R11
+ SLLV $3, R5
+ MOVV $0, R6
+loop:
+ BEQ R5, R6, done
+ MOVV (R6)(R7), R8
+ MULV R8, R10, R9
+ MULHVU R8, R10, R12
+ ADDV R9, R11, R8
+ SGTU R9, R8, R11 // if (c' = lo + c) < lo then overflow
+ MOVV R8, (R6)(R4)
+ ADDV R12, R11
+ ADDV $8, R6
+ JMP loop
+done:
+ MOVV R11, c+64(FP)
+ RET
TEXT ·addMulVVWW(SB),NOSPLIT,$0
JMP ·addMulVVWW_g(SB)