]> Cypherpunks repositories - gostls13.git/commitdiff
math/big: optimize subVW function for loong64
authorHuang Qiqi <huangqiqi@loongson.cn>
Tue, 11 Jun 2024 12:33:50 +0000 (20:33 +0800)
committerabner chenc <chenguoqi@loongson.cn>
Tue, 15 Apr 2025 11:56:10 +0000 (04:56 -0700)
Benchmark results on Loongson 3C5000 (which is an LA464 implementation):

goos: linux
goarch: loong64
pkg: math/big
cpu: Loongson-3C5000 @ 2200.00MHz
                │ test/old_3c5000_subvw.log │      test/new_3c5000_subvw.log      │
                │          sec/op           │   sec/op     vs base                │
SubVW/1                         8.564n ± 0%   5.915n ± 0%  -30.93% (p=0.000 n=20)
SubVW/2                        11.675n ± 0%   6.825n ± 0%  -41.54% (p=0.000 n=20)
SubVW/3                        13.410n ± 0%   7.969n ± 0%  -40.57% (p=0.000 n=20)
SubVW/4                        15.300n ± 0%   9.740n ± 0%  -36.34% (p=0.000 n=20)
SubVW/5                         17.34n ± 1%   10.66n ± 0%  -38.55% (p=0.000 n=20)
SubVW/10                        26.55n ± 0%   15.21n ± 0%  -42.70% (p=0.000 n=20)
SubVW/100                       199.2n ± 0%   102.5n ± 0%  -48.52% (p=0.000 n=20)
SubVW/1000                     1866.5n ± 1%   924.6n ± 0%  -50.46% (p=0.000 n=20)
SubVW/10000                     17.67µ ± 2%   12.04µ ± 2%  -31.83% (p=0.000 n=20)
SubVW/100000                    186.4µ ± 0%   132.0µ ± 0%  -29.17% (p=0.000 n=20)
SubVWext/1                      8.616n ± 0%   5.949n ± 0%  -30.95% (p=0.000 n=20)
SubVWext/2                     11.410n ± 0%   7.008n ± 1%  -38.58% (p=0.000 n=20)
SubVWext/3                     13.255n ± 1%   8.073n ± 0%  -39.09% (p=0.000 n=20)
SubVWext/4                     15.095n ± 0%   9.893n ± 0%  -34.47% (p=0.000 n=20)
SubVWext/5                      16.87n ± 0%   10.86n ± 0%  -35.63% (p=0.000 n=20)
SubVWext/10                     26.00n ± 0%   15.54n ± 0%  -40.22% (p=0.000 n=20)
SubVWext/100                    196.0n ± 0%   104.3n ± 1%  -46.76% (p=0.000 n=20)
SubVWext/1000                  1847.0n ± 0%   923.7n ± 0%  -49.99% (p=0.000 n=20)
SubVWext/10000                  17.30µ ± 1%   11.71µ ± 1%  -32.31% (p=0.000 n=20)
SubVWext/100000                 187.5µ ± 0%   131.6µ ± 0%  -29.82% (p=0.000 n=20)
geomean                         159.7n        97.79n       -38.79%

Change-Id: I21a6903e79b02cb22282e80c9bfe2ae9f1a87589
Reviewed-on: https://go-review.googlesource.com/c/go/+/659878
Reviewed-by: Carlos Amedee <carlos@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
src/math/big/arith_loong64.s

index a0130efc31737392ff8a0a117013aa612b491e36..41229a1f0fda442d972f4be3ef5372c20de8c81c 100644 (file)
@@ -40,8 +40,30 @@ done:
        MOVV    R10, c+56(FP)
        RET
 
+// func subVW(z, x []Word, y Word) (c Word)
 TEXT ·subVW(SB),NOSPLIT,$0
-       JMP ·subVW_g(SB)
+       // input:
+       //   R4: z
+       //   R5: z_len
+       //   R7: x
+       //   R10: y
+       MOVV    z+0(FP), R4
+       MOVV    z_len+8(FP), R5
+       MOVV    x+24(FP), R7
+       MOVV    y+48(FP), R10
+       MOVV    $0, R6
+       SLLV    $3, R5
+loop:
+       BEQ     R5, R6, done
+       MOVV    (R6)(R7), R8
+       SUBV    R10, R8, R11    // x1 - c = z1, if z1 > x1 then overflow
+       SGTU    R11, R8, R10
+       MOVV    R11, (R6)(R4)
+       ADDV    $8, R6
+       JMP     loop
+done:
+       MOVV    R10, c+56(FP)
+       RET
 
 TEXT ·lshVU(SB),NOSPLIT,$0
        JMP ·lshVU_g(SB)