]> Cypherpunks repositories - gostls13.git/commitdiff
math/big: implement subVV in riscv64 assembly
authorJoel Sing <joel@sing.id.au>
Thu, 27 Jun 2024 10:02:04 +0000 (20:02 +1000)
committerJoel Sing <joel@sing.id.au>
Thu, 15 Aug 2024 12:06:59 +0000 (12:06 +0000)
This provides an assembly implementation of subVV for riscv64,
processing up to four words per loop, resulting in a significant
performance gain.

On a StarFive VisionFive 2:

               │   subvv.1    │               subvv.2               │
               │    sec/op    │   sec/op     vs base                │
SubVV/1-4         73.46n ± 0%   48.08n ± 0%  -34.55% (p=0.000 n=10)
SubVV/2-4         88.13n ± 0%   58.76n ± 0%  -33.33% (p=0.000 n=10)
SubVV/3-4        102.80n ± 0%   69.45n ± 0%  -32.44% (p=0.000 n=10)
SubVV/4-4        117.50n ± 0%   72.11n ± 0%  -38.63% (p=0.000 n=10)
SubVV/5-4        132.20n ± 0%   82.80n ± 0%  -37.37% (p=0.000 n=10)
SubVV/10-4        216.3n ± 0%   126.9n ± 0%  -41.33% (p=0.000 n=10)
SubVV/100-4      1659.0n ± 0%   886.5n ± 0%  -46.56% (p=0.000 n=10)
SubVV/1000-4     16.089µ ± 0%   8.401µ ± 0%  -47.78% (p=0.000 n=10)
SubVV/10000-4     244.7µ ± 0%   176.8µ ± 0%  -27.74% (p=0.000 n=10)
SubVV/100000-4    2.562m ± 0%   1.871m ± 0%  -26.96% (p=0.000 n=10)
geomean           1.436µ        904.4n       -37.04%

               │   subvv.1    │                subvv.2                │
               │     B/s      │      B/s       vs base                │
SubVV/1-4        830.9Mi ± 0%   1269.5Mi ± 0%  +52.79% (p=0.000 n=10)
SubVV/2-4        1.353Gi ± 0%    2.029Gi ± 0%  +49.99% (p=0.000 n=10)
SubVV/3-4        1.739Gi ± 0%    2.575Gi ± 0%  +48.06% (p=0.000 n=10)
SubVV/4-4        2.029Gi ± 0%    3.306Gi ± 0%  +62.96% (p=0.000 n=10)
SubVV/5-4        2.254Gi ± 0%    3.600Gi ± 0%  +59.67% (p=0.000 n=10)
SubVV/10-4       2.755Gi ± 0%    4.699Gi ± 0%  +70.53% (p=0.000 n=10)
SubVV/100-4      3.594Gi ± 0%    6.723Gi ± 0%  +87.08% (p=0.000 n=10)
SubVV/1000-4     3.705Gi ± 0%    7.095Gi ± 0%  +91.52% (p=0.000 n=10)
SubVV/10000-4    2.436Gi ± 0%    3.372Gi ± 0%  +38.39% (p=0.000 n=10)
SubVV/100000-4   2.327Gi ± 0%    3.185Gi ± 0%  +36.91% (p=0.000 n=10)
geomean          2.118Gi         3.364Gi       +58.84%

Change-Id: I361cb3f4195b27a9f1e9486c9e1fdbeaa94d32b4
Reviewed-on: https://go-review.googlesource.com/c/go/+/595396
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
Reviewed-by: Mark Ryan <markdryan@rivosinc.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
src/math/big/arith_riscv64.s

index 67812dd646f7342db6571f100df6af0bbfa6eaa0..897d08229eef185c75287f6184b955b3fbed6dea 100644 (file)
@@ -92,7 +92,86 @@ done:
        RET
 
 TEXT ·subVV(SB),NOSPLIT,$0
-       JMP ·subVV_g(SB)
+       MOV     x+24(FP), X5
+       MOV     y+48(FP), X6
+       MOV     z+0(FP), X7
+       MOV     z_len+8(FP), X30
+
+       MOV     $4, X28
+       MOV     $0, X29         // b = 0
+
+       BEQZ    X30, done
+       BLTU    X30, X28, loop1
+
+loop4:
+       MOV     0(X5), X8       // x[0]
+       MOV     0(X6), X9       // y[0]
+       MOV     8(X5), X11      // x[1]
+       MOV     8(X6), X12      // y[1]
+       MOV     16(X5), X14     // x[2]
+       MOV     16(X6), X15     // y[2]
+       MOV     24(X5), X17     // x[3]
+       MOV     24(X6), X18     // y[3]
+
+       SUB     X9, X8, X21     // z[0] = x[0] - y[0]
+       SLTU    X21, X8, X22
+       SUB     X29, X21, X10   // z[0] = x[0] - y[0] - b
+       SLTU    X10, X21, X23
+       ADD     X22, X23, X29   // next b
+
+       SUB     X12, X11, X24   // z[1] = x[1] - y[1]
+       SLTU    X24, X11, X25
+       SUB     X29, X24, X13   // z[1] = x[1] - y[1] - b
+       SLTU    X13, X24, X26
+       ADD     X25, X26, X29   // next b
+
+       SUB     X15, X14, X21   // z[2] = x[2] - y[2]
+       SLTU    X21, X14, X22
+       SUB     X29, X21, X16   // z[2] = x[2] - y[2] - b
+       SLTU    X16, X21, X23
+       ADD     X22, X23, X29   // next b
+
+       SUB     X18, X17, X21   // z[3] = x[3] - y[3]
+       SLTU    X21, X17, X22
+       SUB     X29, X21, X19   // z[3] = x[3] - y[3] - b
+       SLTU    X19, X21, X23
+       ADD     X22, X23, X29   // next b
+
+       MOV     X10, 0(X7)      // z[0]
+       MOV     X13, 8(X7)      // z[1]
+       MOV     X16, 16(X7)     // z[2]
+       MOV     X19, 24(X7)     // z[3]
+
+       ADD     $32, X5
+       ADD     $32, X6
+       ADD     $32, X7
+       SUB     $4, X30
+
+       BGEU    X30, X28, loop4
+       BEQZ    X30, done
+
+loop1:
+       MOV     0(X5), X10      // x
+       MOV     0(X6), X11      // y
+
+       SUB     X11, X10, X12   // z = x - y
+       SLTU    X12, X10, X14
+       SUB     X29, X12, X13   // z = x - y - b
+       SLTU    X13, X12, X15
+       ADD     X14, X15, X29   // next b
+
+       MOV     X13, 0(X7)      // z
+
+       ADD     $8, X5
+       ADD     $8, X6
+       ADD     $8, X7
+       SUB     $1, X30
+
+       BNEZ    X30, loop1
+
+done:
+       MOV     X29, c+72(FP)   // return b
+       RET
 
 TEXT ·addVW(SB),NOSPLIT,$0
        JMP ·addVW_g(SB)