]> Cypherpunks repositories - gostls13.git/commitdiff
math/big: optimize mulAddVWW on arm64 for better performance
authorerifan01 <eric.fang@arm.com>
Wed, 16 May 2018 06:25:07 +0000 (06:25 +0000)
committerCherry Zhang <cherryyz@google.com>
Mon, 22 Apr 2019 14:45:16 +0000 (14:45 +0000)
Unroll the cycle 4 times to reduce load overhead.

Benchmarks:
name                old time/op    new time/op    delta
MulAddVWW/1-8         15.9ns ± 0%    11.9ns ± 0%  -24.92%  (p=0.000 n=8+8)
MulAddVWW/2-8         16.1ns ± 0%    13.9ns ± 1%  -13.82%  (p=0.000 n=8+8)
MulAddVWW/3-8         18.9ns ± 0%    17.3ns ± 0%   -8.47%  (p=0.000 n=8+8)
MulAddVWW/4-8         21.7ns ± 0%    19.5ns ± 0%  -10.14%  (p=0.000 n=8+8)
MulAddVWW/5-8         25.1ns ± 0%    22.5ns ± 0%  -10.27%  (p=0.000 n=8+8)
MulAddVWW/10-8        41.6ns ± 0%    40.0ns ± 0%   -3.79%  (p=0.000 n=8+8)
MulAddVWW/100-8        368ns ± 0%     363ns ± 0%   -1.36%  (p=0.000 n=8+8)
MulAddVWW/1000-8      3.52µs ± 0%    3.52µs ± 0%   -0.14%  (p=0.000 n=8+8)
MulAddVWW/10000-8     35.1µs ± 0%    35.1µs ± 0%   -0.01%  (p=0.000 n=7+6)
MulAddVWW/100000-8     351µs ± 0%     351µs ± 0%   +0.15%  (p=0.038 n=8+8)

Change-Id: I052a4db286ac6e4f3293289c7e9a82027da0405e
Reviewed-on: https://go-review.googlesource.com/c/go/+/155780
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
src/math/big/arith_arm64.s
src/math/big/arith_test.go

index bb23751ba3b9b44a6fbcfac01491413eead469d1..114d5f67f2a6d0666dbf5a54e705c000505f2e60 100644 (file)
@@ -363,16 +363,51 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0
        MOVD    x+24(FP), R2
        MOVD    y+48(FP), R3
        MOVD    r+56(FP), R4
-loop:
-       CBZ     R0, done
+       // c, z = x * y + r
+       TBZ     $0, R0, two
        MOVD.P  8(R2), R5
-       UMULH   R5, R3, R7
-       MUL     R5, R3, R6
-       ADDS    R4, R6
-       ADC     $0, R7
-       MOVD.P  R6, 8(R1)
-       MOVD    R7, R4
+       MUL     R3, R5, R7
+       UMULH   R3, R5, R8
+       ADDS    R4, R7
+       ADC     $0, R8, R4      // c, z[i] = x[i] * y +  r
+       MOVD.P  R7, 8(R1)
        SUB     $1, R0
+two:
+       TBZ     $1, R0, loop
+       LDP.P   16(R2), (R5, R6)
+       MUL     R3, R5, R10
+       UMULH   R3, R5, R11
+       ADDS    R4, R10
+       MUL     R3, R6, R12
+       UMULH   R3, R6, R13
+       ADCS    R12, R11
+       ADC     $0, R13, R4
+
+       STP.P   (R10, R11), 16(R1)
+       SUB     $2, R0
+loop:
+       CBZ     R0, done
+       LDP.P   32(R2), (R5, R6)
+       LDP     -16(R2), (R7, R8)
+
+       MUL     R3, R5, R10
+       UMULH   R3, R5, R11
+       ADDS    R4, R10
+       MUL     R3, R6, R12
+       UMULH   R3, R6, R13
+       ADCS    R11, R12
+
+       MUL     R3, R7, R14
+       UMULH   R3, R7, R15
+       ADCS    R13, R14
+       MUL     R3, R8, R16
+       UMULH   R3, R8, R17
+       ADCS    R15, R16
+       ADC     $0, R17, R4
+
+       STP.P   (R10, R12), 32(R1)
+       STP     (R14, R16), -16(R1)
+       SUB     $4, R0
        B       loop
 done:
        MOVD    R4, c+64(FP)
index 8a643211024623611dec82cc0b8b0819b2f63128..d28f6806881584d0abe6a7c1afc3e63c5bf43a1c 100644 (file)
@@ -371,6 +371,24 @@ func TestMulAddWWW(t *testing.T) {
        }
 }
 
+func BenchmarkMulAddVWW(b *testing.B) {
+       for _, n := range benchSizes {
+               if isRaceBuilder && n > 1e3 {
+                       continue
+               }
+               z := make([]Word, n+1)
+               x := rndV(n)
+               y := rndW()
+               r := rndW()
+               b.Run(fmt.Sprint(n), func(b *testing.B) {
+                       b.SetBytes(int64(n * _W))
+                       for i := 0; i < b.N; i++ {
+                               mulAddVWW(z, x, y, r)
+                       }
+               })
+       }
+}
+
 func BenchmarkAddMulVVW(b *testing.B) {
        for _, n := range benchSizes {
                if isRaceBuilder && n > 1e3 {