]> Cypherpunks repositories - gostls13.git/commitdiff
math/big: improve performance of addVW/subVW for ppc64x
authorCarlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Mon, 19 Mar 2018 22:23:34 +0000 (19:23 -0300)
committerLynn Boger <laboger@linux.vnet.ibm.com>
Tue, 27 Mar 2018 15:06:53 +0000 (15:06 +0000)
This change adds a better implementation in asm for addVW/subVW for
ppc64x, with speedups up to 3.11x.

benchmark                    old ns/op     new ns/op     delta
BenchmarkAddVW/1-16          6.87          5.71          -16.89%
BenchmarkAddVW/2-16          7.72          5.94          -23.06%
BenchmarkAddVW/3-16          8.74          6.56          -24.94%
BenchmarkAddVW/4-16          9.66          7.26          -24.84%
BenchmarkAddVW/5-16          10.8          7.26          -32.78%
BenchmarkAddVW/10-16         17.4          9.97          -42.70%
BenchmarkAddVW/100-16        164           56.0          -65.85%
BenchmarkAddVW/1000-16       1638          524           -68.01%
BenchmarkAddVW/10000-16      16421         5201          -68.33%
BenchmarkAddVW/100000-16     165762        53324         -67.83%
BenchmarkSubVW/1-16          6.76          5.62          -16.86%
BenchmarkSubVW/2-16          7.69          6.02          -21.72%
BenchmarkSubVW/3-16          8.85          6.61          -25.31%
BenchmarkSubVW/4-16          10.0          7.34          -26.60%
BenchmarkSubVW/5-16          11.3          7.33          -35.13%
BenchmarkSubVW/10-16         19.5          18.7          -4.10%
BenchmarkSubVW/100-16        153           55.9          -63.46%
BenchmarkSubVW/1000-16       1502          519           -65.45%
BenchmarkSubVW/10000-16      15005         5165          -65.58%
BenchmarkSubVW/100000-16     150620        53124         -64.73%

benchmark                    old MB/s     new MB/s     speedup
BenchmarkAddVW/1-16          1165.12      1400.76      1.20x
BenchmarkAddVW/2-16          2071.39      2693.25      1.30x
BenchmarkAddVW/3-16          2744.72      3656.92      1.33x
BenchmarkAddVW/4-16          3311.63      4407.34      1.33x
BenchmarkAddVW/5-16          3700.52      5512.48      1.49x
BenchmarkAddVW/10-16         4605.63      8026.37      1.74x
BenchmarkAddVW/100-16        4856.15      14296.76     2.94x
BenchmarkAddVW/1000-16       4883.96      15264.21     3.13x
BenchmarkAddVW/10000-16      4871.52      15380.78     3.16x
BenchmarkAddVW/100000-16     4826.17      15002.48     3.11x
BenchmarkSubVW/1-16          1183.20      1423.03      1.20x
BenchmarkSubVW/2-16          2081.92      2657.44      1.28x
BenchmarkSubVW/3-16          2711.52      3632.30      1.34x
BenchmarkSubVW/4-16          3198.30      4360.30      1.36x
BenchmarkSubVW/5-16          3534.43      5460.40      1.54x
BenchmarkSubVW/10-16         4106.34      4273.51      1.04x
BenchmarkSubVW/100-16        5213.48      14306.32     2.74x
BenchmarkSubVW/1000-16       5324.27      15391.21     2.89x
BenchmarkSubVW/10000-16      5331.33      15486.57     2.90x
BenchmarkSubVW/100000-16     5311.35      15059.01     2.84x

Change-Id: Ibaa5b9b38d63fba8e01a9c327eb8bef1e6e908c1
Reviewed-on: https://go-review.googlesource.com/101975
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
src/math/big/arith_ppc64x.s

index 74db48933f8a4c14f703eddd2efd57c3cd121ec1..b3ac91e35ed4350487b17b6d117736591fc2b729 100644 (file)
@@ -84,11 +84,155 @@ sublend:
        MOVD  R4, c+72(FP)
        RET
 
+// func addVW(z, x []Word, y Word) (c Word)
 TEXT ·addVW(SB), NOSPLIT, $0
-       BR ·addVW_g(SB)
+       MOVD z+0(FP), R10       // R10 = z[]
+       MOVD x+24(FP), R8       // R8 = x[]
+       MOVD y+48(FP), R4       // R4 = y = c
+       MOVD z_len+8(FP), R11   // R11 = z_len
+
+       CMP   R0, R11           // If z_len is zero, return
+       BEQ   done
 
+       // We will process the first iteration out of the loop so we capture
+       // the value of c. In the subsequent iterations, we will rely on the
+       // value of CA set here.
+       MOVD  0(R8), R20        // R20 = x[i]
+       ADD   $-1, R11          // R11 = z_len - 1
+       ADDC  R20, R4, R6       // R6 = x[i] + c
+       CMP   R0, R11           // If z_len was 1, we are done
+       MOVD  R6, 0(R10)        // z[i]
+       BEQ   final
+
+       // We will read 4 elements per iteration
+       SRD   $2, R11, R9       // R9 = z_len/4
+       DCBT  (R8)
+       CMP   R0, R9
+       MOVD  R9, CTR           // Set up the loop counter
+       BEQ   tail              // If R9 = 0, we can't use the loop
+
+loop:
+       MOVD  8(R8), R20        // R20 = x[i]
+       MOVD  16(R8), R21       // R21 = x[i+1]
+       MOVD  24(R8), R22       // R22 = x[i+2]
+       MOVDU 32(R8), R23       // R23 = x[i+3]
+       ADDZE R20, R24          // R24 = x[i] + CA
+       ADDZE R21, R25          // R25 = x[i+1] + CA
+       ADDZE R22, R26          // R26 = x[i+2] + CA
+       ADDZE R23, R27          // R27 = x[i+3] + CA
+       MOVD  R24, 8(R10)       // z[i]
+       MOVD  R25, 16(R10)      // z[i+1]
+       MOVD  R26, 24(R10)      // z[i+2]
+       MOVDU R27, 32(R10)      // z[i+3]
+       ADD   $-4, R11          // R11 = z_len - 4
+       BC    16, 0, loop       // bdnz
+
+       // We may have some elements to read
+       CMP R0, R11
+       BEQ final
+
+tail:
+       MOVDU 8(R8), R20
+       ADDZE R20, R24
+       ADD $-1, R11
+       MOVDU R24, 8(R10)
+       CMP R0, R11
+       BEQ final
+
+       MOVDU 8(R8), R20
+       ADDZE R20, R24
+       ADD $-1, R11
+       MOVDU R24, 8(R10)
+       CMP R0, R11
+       BEQ final
+
+       MOVD 8(R8), R20
+       ADDZE R20, R24
+       MOVD R24, 8(R10)
+
+final:
+       ADDZE R0, R4            // c = CA
+done:
+       MOVD  R4, c+56(FP)
+       RET
+
+// func subVW(z, x []Word, y Word) (c Word)
 TEXT ·subVW(SB), NOSPLIT, $0
-       BR ·subVW_g(SB)
+       MOVD  z+0(FP), R10      // R10 = z[]
+       MOVD  x+24(FP), R8      // R8 = x[]
+       MOVD  y+48(FP), R4      // R4 = y = c
+       MOVD  z_len+8(FP), R11  // R11 = z_len
+
+       CMP   R0, R11           // If z_len is zero, return
+       BEQ   done
+
+       // We will process the first iteration out of the loop so we capture
+       // the value of c. In the subsequent iterations, we will rely on the
+       // value of CA set here.
+       MOVD  0(R8), R20        // R20 = x[i]
+       ADD   $-1, R11          // R11 = z_len - 1
+       SUBC  R4, R20, R6       // R6 = x[i] - c
+       CMP   R0, R11           // If z_len was 1, we are done
+       MOVD  R6, 0(R10)        // z[i]
+       BEQ   final
+
+       // We will read 4 elements per iteration
+       SRD   $2, R11, R9       // R9 = z_len/4
+       DCBT  (R8)
+       CMP   R0, R9
+       MOVD  R9, CTR           // Set up the loop counter
+       BEQ   tail              // If R9 = 0, we can't use the loop
+
+       // The loop here is almost the same as the one used in s390x, but
+       // we don't need to capture CA every iteration because we've already
+       // done that above.
+loop:
+       MOVD  8(R8), R20
+       MOVD  16(R8), R21
+       MOVD  24(R8), R22
+       MOVDU 32(R8), R23
+       SUBE  R0, R20
+       SUBE  R0, R21
+       SUBE  R0, R22
+       SUBE  R0, R23
+       MOVD  R20, 8(R10)
+       MOVD  R21, 16(R10)
+       MOVD  R22, 24(R10)
+       MOVDU R23, 32(R10)
+       ADD   $-4, R11
+       BC    16, 0, loop       // bdnz
+
+       // We may have some elements to read
+       CMP   R0, R11
+       BEQ   final
+
+tail:
+       MOVDU 8(R8), R20
+       SUBE  R0, R20
+       ADD   $-1, R11
+       MOVDU R20, 8(R10)
+       CMP   R0, R11
+       BEQ   final
+
+       MOVDU 8(R8), R20
+       SUBE  R0, R20
+       ADD   $-1, R11
+       MOVDU R20, 8(R10)
+       CMP   R0, R11
+       BEQ   final
+
+       MOVD  8(R8), R20
+       SUBE  R0, R20
+       MOVD  R20, 8(R10)
+
+final:
+       // Capture CA
+       SUBE  R4, R4
+       NEG   R4, R4
+
+done:
+       MOVD  R4, c+56(FP)
+       RET
 
 TEXT ·shlVU(SB), NOSPLIT, $0
        BR ·shlVU_g(SB)