Cypherpunks repositories - gostls13.git/commitdiff
math/big: optimize mulAddVWW function for loong64
author Huang Qiqi <huangqiqi@loongson.cn>
Wed, 19 Jun 2024 06:31:00 +0000 (06:31 +0000)
committer abner chenc <chenguoqi@loongson.cn>
Tue, 15 Apr 2025 11:56:20 +0000 (04:56 -0700)
Benchmark results on Loongson 3A5000 (which is an LA464 implementation):

goos: linux
goarch: loong64
pkg: math/big
cpu: Loongson-3A5000-HV @ 2500.00MHz
                 │ test/old_3a5000_muladdvww.log │    test/new_3a5000_muladdvww.log    │
                 │            sec/op             │   sec/op     vs base                │
MulAddVWW/1                          7.606n ± 0%   6.987n ± 0%   -8.14% (p=0.000 n=20)
MulAddVWW/2                          9.207n ± 0%   8.567n ± 0%   -6.95% (p=0.000 n=20)
MulAddVWW/3                         10.810n ± 0%   9.223n ± 0%  -14.68% (p=0.000 n=20)
MulAddVWW/4                          13.01n ± 0%   12.41n ± 0%   -4.61% (p=0.000 n=20)
MulAddVWW/5                          15.79n ± 0%   12.99n ± 0%  -17.73% (p=0.000 n=20)
MulAddVWW/10                         25.62n ± 0%   20.02n ± 0%  -21.86% (p=0.000 n=20)
MulAddVWW/100                        217.0n ± 0%   170.9n ± 0%  -21.24% (p=0.000 n=20)
MulAddVWW/1000                       2.064µ ± 0%   1.612µ ± 0%  -21.90% (p=0.000 n=20)
MulAddVWW/10000                      24.50µ ± 0%   16.74µ ± 0%  -31.66% (p=0.000 n=20)
MulAddVWW/100000                     239.1µ ± 0%   171.1µ ± 0%  -28.45% (p=0.000 n=20)
geomean                              159.2n        130.3n       -18.18%

Change-Id: I063434bc382f4f1234f879172ab671a3d6f2eb80
Reviewed-on: https://go-review.googlesource.com/c/go/+/659881
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Carlos Amedee <carlos@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>

src/math/big/arith_loong64.s

index 41229a1f0fda442d972f4be3ef5372c20de8c81c..9b22a2655347971f97bc61445ef16004b1983c0b 100644 (file)
@@ -71,8 +71,35 @@ TEXT ·lshVU(SB),NOSPLIT,$0
 TEXT ·rshVU(SB),NOSPLIT,$0
        JMP ·rshVU_g(SB)    // tail-jump: the body lives in ·rshVU_g, which is defined outside this hunk
 
+// func mulAddVWW(z, x []Word, y, r Word) (c Word): z[i] = low word of (x[i]*y + carry), carry seeded from r; c is the final carry.
 TEXT ·mulAddVWW(SB),NOSPLIT,$0
-       JMP ·mulAddVWW_g(SB)
+       // input (R6 is the running byte offset; R8, R9, R12 are per-iteration temporaries):
+       //   R4: z (base address of the destination words)
+       //   R5: z_len (word count; scaled to a byte count before the loop)
+       //   R7: x (base address; x_len is never read, x is indexed with z's offsets)
+       //   R10: y
+       //   R11: r (initial carry; reused as the running carry word)
+       MOVV    z+0(FP), R4         // R4 = &z[0]
+       MOVV    z_len+8(FP), R5     // R5 = len(z)
+       MOVV    x+24(FP), R7        // R7 = &x[0]
+       MOVV    y+48(FP), R10       // R10 = y (multiplier, loop-invariant)
+       MOVV    r+56(FP), R11       // R11 = r (carry into word 0)
+       SLLV    $3, R5              // R5 = len(z) * 8: end offset in bytes
+       MOVV    $0, R6              // R6 = 0: byte offset of the current word
+loop:
+       BEQ     R5, R6, done        // stop once the offset reaches the end (also covers len(z) == 0)
+       MOVV    (R6)(R7), R8        // R8 = x[i]
+       MULV    R8, R10, R9         // R9 = lo = low 64 bits of x[i] * y
+       MULHVU  R8, R10, R12        // R12 = hi = high 64 bits of the unsigned product
+       ADDV    R9, R11, R8         // R8 = lo + c (may wrap)
+       SGTU    R9, R8, R11         // if (c' = lo + c) < lo then overflow: R11 = 1, else 0
+       MOVV    R8, (R6)(R4)        // z[i] = lo + c
+       ADDV    R12, R11            // R11 = hi + overflow bit: carry into the next word (hi <= 2^64-2, so no wrap)
+       ADDV    $8, R6              // advance to the next 8-byte word
+       JMP     loop
+done:
+       MOVV    R11, c+64(FP)       // return the final carry word as c
+       RET
 
 TEXT ·addMulVVWW(SB),NOSPLIT,$0
        JMP ·addMulVVWW_g(SB)