]> Cypherpunks repositories - gostls13.git/commitdiff
math/big,crypto/internal/bigmod: unroll loop in addMulVVW for ppc64x
authorLynn Boger <laboger@linux.vnet.ibm.com>
Tue, 23 Jan 2024 18:46:05 +0000 (12:46 -0600)
committerLynn Boger <laboger@linux.vnet.ibm.com>
Thu, 25 Jan 2024 19:32:43 +0000 (19:32 +0000)
This updates the assembly implementation of AddMulVVW to
unroll the main loop to do 64 bytes at a time.

The code for addMulVVWx is based on the same code and has
also been updated to improve performance.

goos: linux
goarch: ppc64le
pkg: crypto/internal/bigmod
cpu: POWER10
               │ bg.orig.out │               bg.out               │
               │   sec/op    │   sec/op     vs base               │
ModAdd           116.3n ± 0%   116.9n ± 0%   +0.52% (p=0.002 n=6)
ModSub           111.5n ± 0%   111.5n ± 0%    0.00% (p=0.273 n=6)
MontgomeryRepr   2.195µ ± 0%   1.944µ ± 0%  -11.44% (p=0.002 n=6)
MontgomeryMul    2.195µ ± 0%   1.943µ ± 0%  -11.48% (p=0.002 n=6)
ModMul           4.418µ ± 0%   3.900µ ± 0%  -11.72% (p=0.002 n=6)
ExpBig           5.736m ± 0%   5.117m ± 0%  -10.78% (p=0.002 n=6)
Exp              5.891m ± 0%   5.237m ± 0%  -11.11% (p=0.002 n=6)
geomean          9.901µ        9.094µ        -8.15%

goos: linux
goarch: ppc64le
pkg: math/big
cpu: POWER10
                 │ am.orig.out  │               am.out               │
                 │    sec/op    │   sec/op     vs base               │
AddMulVVW/1         4.456n ± 1%   3.565n ± 0%  -20.00% (p=0.002 n=6)
AddMulVVW/2         4.875n ± 1%   5.938n ± 1%  +21.79% (p=0.002 n=6)
AddMulVVW/3         5.484n ± 0%   5.693n ± 0%   +3.80% (p=0.002 n=6)
AddMulVVW/4         6.370n ± 0%   6.065n ± 0%   -4.79% (p=0.002 n=6)
AddMulVVW/5         7.321n ± 0%   7.188n ± 0%   -1.82% (p=0.002 n=6)
AddMulVVW/10        12.26n ± 8%   11.41n ± 0%   -6.97% (p=0.002 n=6)
AddMulVVW/100      100.70n ± 0%   93.58n ± 0%   -7.08% (p=0.002 n=6)
AddMulVVW/1000      938.6n ± 0%   845.5n ± 0%   -9.92% (p=0.002 n=6)
AddMulVVW/10000     9.459µ ± 0%   8.415µ ± 0%  -11.04% (p=0.002 n=6)
AddMulVVW/100000    94.57µ ± 0%   84.01µ ± 0%  -11.16% (p=0.002 n=6)
geomean             75.17n        71.21n        -5.27%

Change-Id: Idd79f5f02387564f4c2cc28d50b1c12bcd9a400f
Reviewed-on: https://go-review.googlesource.com/c/go/+/557915
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Paul Murphy <murp@ibm.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
src/crypto/internal/bigmod/nat_ppc64x.s
src/math/big/arith_ppc64x.s

index 974f4f945e49d097b9ef5bfd517d539ec973814a..94260ca29f3c328e762fe8e5adf7bd7332a5e841 100644 (file)
@@ -8,44 +8,75 @@
 
 // func addMulVVW1024(z, x *uint, y uint) (c uint)
 TEXT ·addMulVVW1024(SB), $0-32
-       MOVD    $16, R22 // R22 = z_len
-       JMP             addMulVVWx(SB)
+       MOVD    $4, R6 // R6 = z_len/4
+       JMP             addMulVVWx<>(SB)
 
 // func addMulVVW1536(z, x *uint, y uint) (c uint)
 TEXT ·addMulVVW1536(SB), $0-32
-       MOVD    $24, R22 // R22 = z_len
-       JMP             addMulVVWx(SB)
+       MOVD    $6, R6 // R6 = z_len/4
+       JMP             addMulVVWx<>(SB)
 
 // func addMulVVW2048(z, x *uint, y uint) (c uint)
 TEXT ·addMulVVW2048(SB), $0-32
-       MOVD    $32, R22 // R22 = z_len
-       JMP             addMulVVWx(SB)
+       MOVD    $8, R6 // R6 = z_len/4
+       JMP             addMulVVWx<>(SB)
 
-TEXT addMulVVWx(SB), NOFRAME|NOSPLIT, $0
-       MOVD z+0(FP), R10       // R10 = z[]
-       MOVD x+8(FP), R8        // R8 = x[]
-       MOVD y+16(FP), R9       // R9 = y
+// This local function expects to be called only by
+// callers above. R6 contains the z length/4
+// since 4 values are processed for each
+// loop iteration, and is guaranteed to be > 0.
+// If other callers are added this function might
+// need to change.
+TEXT addMulVVWx<>(SB), NOSPLIT, $0
+       MOVD    z+0(FP), R3
+       MOVD    x+8(FP), R4
+       MOVD    y+16(FP), R5
 
-       MOVD R0, R3             // R3 will be the index register
-       CMP  R0, R22
-       MOVD R0, R4             // R4 = c = 0
-       MOVD R22, CTR           // Initialize loop counter
-       BEQ  done
-       PCALIGN $16
+       MOVD    $0, R9          // R9 = c = 0
+       MOVD    R6, CTR         // Initialize loop counter
+       PCALIGN $16
 
 loop:
-       MOVD  (R8)(R3), R20     // Load x[i]
-       MOVD  (R10)(R3), R21    // Load z[i]
-       MULLD  R9, R20, R6      // R6 = Low-order(x[i]*y)
-       MULHDU R9, R20, R7      // R7 = High-order(x[i]*y)
-       ADDC   R21, R6          // R6 = z0
-       ADDZE  R7               // R7 = z1
-       ADDC   R4, R6           // R6 = z0 + c + 0
-       ADDZE  R7, R4           // c += z1
-       MOVD   R6, (R10)(R3)    // Store z[i]
-       ADD    $8, R3
-       BC  16, 0, loop         // bdnz
+       MOVD    0(R4), R14      // x[i]
+       MOVD    8(R4), R16      // x[i+1]
+       MOVD    16(R4), R18     // x[i+2]
+       MOVD    24(R4), R20     // x[i+3]
+       MOVD    0(R3), R15      // z[i]
+       MOVD    8(R3), R17      // z[i+1]
+       MOVD    16(R3), R19     // z[i+2]
+       MOVD    24(R3), R21     // z[i+3]
+       MULLD   R5, R14, R10    // low x[i]*y
+       MULHDU  R5, R14, R11    // high x[i]*y
+       ADDC    R15, R10
+       ADDZE   R11
+       ADDC    R9, R10
+       ADDZE   R11, R9
+       MULLD   R5, R16, R14    // low x[i+1]*y
+       MULHDU  R5, R16, R15    // high x[i+1]*y
+       ADDC    R17, R14
+       ADDZE   R15
+       ADDC    R9, R14
+       ADDZE   R15, R9
+       MULLD   R5, R18, R16    // low x[i+2]*y
+       MULHDU  R5, R18, R17    // high x[i+2]*y
+       ADDC    R19, R16
+       ADDZE   R17
+       ADDC    R9, R16
+       ADDZE   R17, R9
+       MULLD   R5, R20, R18    // low x[i+3]*y
+       MULHDU  R5, R20, R19    // high x[i+3]*y
+       ADDC    R21, R18
+       ADDZE   R19
+       ADDC    R9, R18
+       ADDZE   R19, R9
+       MOVD    R10, 0(R3)      // z[i]
+       MOVD    R14, 8(R3)      // z[i+1]
+       MOVD    R16, 16(R3)     // z[i+2]
+       MOVD    R18, 24(R3)     // z[i+3]
+       ADD     $32, R3
+       ADD     $32, R4
+       BDNZ    loop
 
 done:
-       MOVD R4, c+24(FP)
+       MOVD    R9, c+24(FP)
        RET
index 9512a12270d42386db75bcca97e21741cec40f9b..c483e252ab5ceb87d3475bb932ef50f6ac1f1e12 100644 (file)
@@ -599,33 +599,80 @@ done:
 
 // func addMulVVW(z, x []Word, y Word) (c Word)
 TEXT ·addMulVVW(SB), NOSPLIT, $0
-       MOVD z+0(FP), R10       // R10 = z[]
-       MOVD x+24(FP), R8       // R8 = x[]
-       MOVD y+48(FP), R9       // R9 = y
-       MOVD z_len+8(FP), R22   // R22 = z_len
-
-       MOVD R0, R3             // R3 will be the index register
-       CMP  R0, R22
-       MOVD R0, R4             // R4 = c = 0
-       MOVD R22, CTR           // Initialize loop counter
-       BEQ  done
-       PCALIGN $16
+       MOVD    z+0(FP), R3     // R3 = z[]
+       MOVD    x+24(FP), R4    // R4 = x[]
+       MOVD    y+48(FP), R5    // R5 = y
+       MOVD    z_len+8(FP), R6 // R6 = z_len
+
+       CMP     R6, $4
+       MOVD    R0, R9          // R9 = c = 0
+       BLT     tail
+       SRD     $2, R6, R7
+       MOVD    R7, CTR         // Initialize loop counter
+       PCALIGN $16
 
 loop:
-       MOVD  (R8)(R3), R20     // Load x[i]
-       MOVD  (R10)(R3), R21    // Load z[i]
-       MULLD  R9, R20, R6      // R6 = Low-order(x[i]*y)
-       MULHDU R9, R20, R7      // R7 = High-order(x[i]*y)
-       ADDC   R21, R6          // R6 = z0
-       ADDZE  R7               // R7 = z1
-       ADDC   R4, R6           // R6 = z0 + c + 0
-       ADDZE  R7, R4           // c += z1
-       MOVD   R6, (R10)(R3)    // Store z[i]
-       ADD    $8, R3
-       BC  16, 0, loop         // bdnz
+       MOVD    0(R4), R14      // x[i]
+       MOVD    8(R4), R16      // x[i+1]
+       MOVD    16(R4), R18     // x[i+2]
+       MOVD    24(R4), R20     // x[i+3]
+       MOVD    0(R3), R15      // z[i]
+       MOVD    8(R3), R17      // z[i+1]
+       MOVD    16(R3), R19     // z[i+2]
+       MOVD    24(R3), R21     // z[i+3]
+       MULLD   R5, R14, R10    // low x[i]*y
+       MULHDU  R5, R14, R11    // high x[i]*y
+       ADDC    R15, R10
+       ADDZE   R11
+       ADDC    R9, R10
+       ADDZE   R11, R9
+       MULLD   R5, R16, R14    // low x[i+1]*y
+       MULHDU  R5, R16, R15    // high x[i+1]*y
+       ADDC    R17, R14
+       ADDZE   R15
+       ADDC    R9, R14
+       ADDZE   R15, R9
+       MULLD   R5, R18, R16    // low x[i+2]*y
+       MULHDU  R5, R18, R17    // high x[i+2]*y
+       ADDC    R19, R16
+       ADDZE   R17
+       ADDC    R9, R16
+       ADDZE   R17, R9
+       MULLD   R5, R20, R18    // low x[i+3]*y
+       MULHDU  R5, R20, R19    // high x[i+3]*y
+       ADDC    R21, R18
+       ADDZE   R19
+       ADDC    R9, R18
+       ADDZE   R19, R9
+       MOVD    R10, 0(R3)      // z[i]
+       MOVD    R14, 8(R3)      // z[i+1]
+       MOVD    R16, 16(R3)     // z[i+2]
+       MOVD    R18, 24(R3)     // z[i+3]
+       ADD     $32, R3
+       ADD     $32, R4
+       BDNZ    loop
+
+       ANDCC   $3, R6
+tail:
+       CMP     R0, R6
+       BEQ     done
+       MOVD    R6, CTR
+       PCALIGN $16
+tailloop:
+       MOVD    0(R4), R14
+       MOVD    0(R3), R15
+       MULLD   R5, R14, R10
+       MULHDU  R5, R14, R11
+       ADDC    R15, R10
+       ADDZE   R11
+       ADDC    R9, R10
+       ADDZE   R11, R9
+       MOVD    R10, 0(R3)
+       ADD     $8, R3
+       ADD     $8, R4
+       BDNZ    tailloop
 
 done:
-       MOVD R4, c+56(FP)
+       MOVD    R9, c+56(FP)
        RET
 
-