From: Archana R
Date: Wed, 9 Feb 2022 13:37:12 +0000 (-0600)
Subject: math/big: Implement shlVU and shrVU in ASM for PPC64
X-Git-Tag: go1.19beta1~682
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=6183920a33c21725ad21d67bee8c1eebb5d30a90;p=gostls13.git

math/big: Implement shlVU and shrVU in ASM for PPC64

Currently the shift left and shift right functions are implemented in Go
on PPC64. Implementing them in ASM, just like AMD and ARM, results in an
overall speedup of the shift benchmarks on POWER8/9/10.

name                         old time/op  new time/op  delta
NonZeroShifts/1/shrVU        8.50ns ± 0%  5.21ns ± 0%  -38.66%
NonZeroShifts/1/shlVU        8.85ns ± 1%  5.24ns ± 0%  -40.78%
NonZeroShifts/2/shrVU        9.16ns ± 0%  5.51ns ± 0%  -39.80%
NonZeroShifts/2/shlVU        9.24ns ± 2%  5.61ns ± 0%  -39.28%
NonZeroShifts/3/shrVU        10.6ns ± 0%  6.8ns ± 0%   -35.78%
NonZeroShifts/3/shlVU        10.7ns ± 2%  6.4ns ± 0%   -40.82%
NonZeroShifts/4/shrVU        12.4ns ± 0%  7.7ns ± 0%   -38.12%
NonZeroShifts/4/shlVU        12.3ns ± 1%  7.5ns ± 0%   -38.67%
NonZeroShifts/5/shrVU        13.2ns ± 0%  8.5ns ± 0%   -35.51%
NonZeroShifts/5/shlVU        13.3ns ± 2%  9.3ns ± 0%   -30.05%
NonZeroShifts/10/shrVU       16.5ns ± 0%  13.1ns ± 0%  -20.12%
NonZeroShifts/10/shlVU       16.8ns ± 1%  14.1ns ± 0%  -16.02%
NonZeroShifts/100/shrVU      122ns ± 0%   94ns ± 0%    -22.87%
NonZeroShifts/100/shlVU      115ns ± 0%   103ns ± 0%   -10.50%
NonZeroShifts/1000/shrVU     1.10µs ± 0%  0.91µs ± 0%  -17.03%
NonZeroShifts/1000/shlVU     1.02µs ± 0%  0.93µs ± 0%  -8.74%
NonZeroShifts/10000/shrVU    10.9µs ± 0%  9.1µs ± 0%   -16.66%
NonZeroShifts/10000/shlVU    10.1µs ± 0%  9.3µs ± 0%   -8.19%
NonZeroShifts/100000/shrVU   109µs ± 0%   91µs ± 0%    -16.01%
NonZeroShifts/100000/shlVU   101µs ± 0%   94µs ± 0%    -7.16%

Change-Id: Ia31951cc29a4169beb494d2951427cbe1e963b11
Reviewed-on: https://go-review.googlesource.com/c/go/+/384474
Reviewed-by: Cherry Mui
Reviewed-by: Lynn Boger
Run-TryBot: Lynn Boger
TryBot-Result: Gopher Robot
Run-TryBot: Russ Cox
Auto-Submit: Russ Cox
Reviewed-by: Ian Lance Taylor
---

diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s
index 68c6286494..601cafe6bb 100644
--- a/src/math/big/arith_ppc64x.s
+++ b/src/math/big/arith_ppc64x.s
@@ -346,11 +346,169 @@ done:
 	MOVD R4, c+56(FP)
 	RET
 
+// shlVU computes z = x<<s over len(z) words: z[i] = x[i]<<s | x[i-1]>>(64-s),
+// with z[0] = x[0]<<s, and returns c = x[len(z)-1]>>(64-s).
+// For s == 0 it copies min(len(x), len(z)) words from x to z (backwards when
+// z starts inside the copied part of x) and returns c = 0; len(z) == 0 returns c = 0.
+//func shlVU(z, x []Word, s uint) (c Word)
 TEXT ·shlVU(SB), NOSPLIT, $0
-	BR ·shlVU_g(SB)
+	MOVD z+0(FP), R3	// R3 = &z[0]
+	MOVD x+24(FP), R6	// R6 = &x[0]
+	MOVD s+48(FP), R9	// R9 = s
+	MOVD z_len+8(FP), R4	// R4 = len(z)
+	MOVD x_len+32(FP), R7	// R7 = len(x)
+	CMP R4, R0		// len(z)==0 return; tested before s==0 because the copy loops below always move at least one word
+	BEQ done
+	CMP R9, R0		// s==0 copy(z,x)
+	BEQ zeroshift
+
+	ADD $-1, R4, R5		// len(z)-1
+	SUBC R9, $64, R4	// ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
+	SLD $3, R5, R7		// byte offset of element len(z)-1
+	ADD R6, R7, R15		// save starting address &x[len(z)-1]
+	ADD R3, R7, R16		// save starting address &z[len(z)-1]
+	MOVD (R6)(R7), R14	// x[len(z)-1]
+	SRD R4, R14, R7		// compute x[len(z)-1]>>ŝ into R7
+	CMP R5, R0		// iterate from i=len(z)-1 to 0
+	BEQ loopexit		// Already at end?
+	MOVD 0(R15),R10		// x[i]
+shloop:
+	SLD R9, R10, R10	// x[i]<<s
+	MOVDU -8(R15), R14	// load x[i-1] and step R15 down to &x[i-1]
+	SRD R4, R14, R11	// x[i-1]>>ŝ
+	OR R11, R10, R10
+	MOVD R10, 0(R16)	// z[i]=x[i]<<s | x[i-1]>>ŝ
+	MOVD R14, R10		// reuse x[i-1] for next iteration
+	ADD $-8, R16		// i--
+	CMP R15, R6		// &x[i-1]>&x[0]?
+	BGT shloop
+loopexit:
+	MOVD 0(R6), R4
+	SLD R9, R4, R4
+	MOVD R4, 0(R3)		// z[0]=x[0]<<s
+	MOVD R7, c+56(FP)	// store pre-computed x[len(z)-1]>>ŝ into c
+	RET
+
+zeroshift:
+	CMP R6, R0		// x is null, nothing to copy
+	BEQ done
+	CMP R6, R3		// if x is same as z, nothing to copy
+	BEQ done
+	CMP R7, R4
+	ISEL $0, R7, R4, R7	// Take the lower bound of lengths of x,z
+	SLD $3, R7, R7		// number of bytes to copy
+	SUB R6, R3, R11		// dest - src
+	CMPU R11, R7, CR2	// < len?
+	BLT CR2, backward	// there is overlap, copy backwards
+	MOVD $0, R14
+	// shlVU processes backwards, but added a forward copy option
+	// since it's faster on POWER
+repeat:
+	MOVD (R6)(R14), R15	// Copy 8 bytes at a time
+	MOVD R15, (R3)(R14)
+	ADD $8, R14
+	CMP R14, R7		// More 8 bytes left?
+	BLT repeat
+	BR done
+backward:
+	ADD $-8,R7, R14		// byte offset of the last word to copy
+repeatback:
+	MOVD (R6)(R14), R15	// copy x into z backwards
+	MOVD R15, (R3)(R14)	// copy 8 bytes at a time
+	SUB $8, R14
+	CMP R14, $-8		// More 8 bytes left?
+	BGT repeatback
+
+done:
+	MOVD R0, c+56(FP)	// c=0
+	RET
 
+// shrVU computes z = x>>s over len(z) words: z[i] = x[i]>>s | x[i+1]<<(64-s),
+// with z[len(z)-1] = x[len(z)-1]>>s, and returns c = x[0]<<(64-s).
+// For s == 0 it copies min(len(x), len(z)) words forward from x to z and
+// returns c = 0; len(z) == 0 returns c = 0.
+//func shrVU(z, x []Word, s uint) (c Word)
 TEXT ·shrVU(SB), NOSPLIT, $0
-	BR ·shrVU_g(SB)
+	MOVD z+0(FP), R3	// R3 = &z[0]
+	MOVD x+24(FP), R6	// R6 = &x[0]
+	MOVD s+48(FP), R9	// R9 = s
+	MOVD z_len+8(FP), R4	// R4 = len(z)
+	MOVD x_len+32(FP), R7	// R7 = len(x)
+
+	CMP R4, R0		// len(z)==0 return; tested before s==0 because the copy loop below always moves at least one word
+	BEQ done
+	CMP R9, R0		// s==0, copy(z,x)
+	BEQ zeroshift
+	SUBC R9, $64, R5	// ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
+
+	MOVD 0(R6), R7
+	SLD R5, R7, R7		// compute x[0]<<ŝ
+	MOVD $1, R8		// iterate from i=1 to i<len(z)
+	CMP R8, R4
+	BGE loopexit		// Already at end?
+
+	// vectorize if len(z) is >=3, else jump to scalar loop
+	CMP R4, $3
+	BLT scalar
+	MTVSRD R9, VS38		// s
+	VSPLTB $7, V6, V4	// splat the byte holding s (VS38 is V6) across V4
+	MTVSRD R5, VS39		// ŝ
+	VSPLTB $7, V7, V2	// splat the byte holding ŝ (VS39 is V7) across V2
+	ADD $-2, R4, R16	// len(z)-2: last i at which a full pair still fits
+	PCALIGN $16
+loopback:
+	ADD $-1, R8, R10
+	SLD $3, R10		// byte offset of element i-1
+	LXVD2X (R6)(R10), VS32	// load x[i-1], x[i]
+	SLD $3, R8, R12		// byte offset of element i
+	LXVD2X (R6)(R12), VS33	// load x[i], x[i+1]
+
+	VSRD V0, V4, V3		// x[i-1]>>s, x[i]>>s
+	VSLD V1, V2, V5		// x[i]<<ŝ, x[i+1]<<ŝ
+	VOR V3, V5, V5		// Or(|) the two registers together
+	STXVD2X VS37, (R3)(R10)	// store into z[i-1] and z[i]
+	ADD $2, R8		// Done processing 2 entries, i and i+1
+	CMP R8, R16		// Are there at least a couple of more entries left?
+	BLE loopback
+	CMP R8, R4		// Are we at the last element?
+	BEQ loopexit
+scalar:
+	ADD $-1, R8, R10
+	SLD $3, R10
+	MOVD (R6)(R10),R11
+	SRD R9, R11, R11	// x[len(z)-2] >> s
+	SLD $3, R8, R12
+	MOVD (R6)(R12), R12
+	SLD R5, R12, R12	// x[len(z)-1]<<ŝ
+	OR R12, R11, R11	// x[len(z)-2]>>s | x[len(z)-1]<<ŝ
+	MOVD R11, (R3)(R10)	// z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
+loopexit:
+	ADD $-1, R4
+	SLD $3, R4		// byte offset of element len(z)-1
+	MOVD (R6)(R4), R5
+	SRD R9, R5, R5		// x[len(z)-1]>>s
+	MOVD R5, (R3)(R4)	// z[len(z)-1]=x[len(z)-1]>>s
+	MOVD R7, c+56(FP)	// store pre-computed x[0]<<ŝ into c
+	RET
+
+zeroshift:
+	CMP R6, R0		// x is null, nothing to copy
+	BEQ done
+	CMP R6, R3		// if x is same as z, nothing to copy
+	BEQ done
+	CMP R7, R4
+	ISEL $0, R7, R4, R7	// Take the lower bounds of lengths of x, z
+	SLD $3, R7, R7		// number of bytes to copy
+	MOVD $0, R14
+repeat:
+	MOVD (R6)(R14), R15	// copy 8 bytes at a time
+	MOVD R15, (R3)(R14)	// shrVU processes bytes only forwards
+	ADD $8, R14
+	CMP R14, R7		// More 8 bytes left?
+	BLT repeat
+done:
+	MOVD R0, c+56(FP)	// c=0
+	RET
 
 // func mulAddVWW(z, x []Word, y, r Word) (c Word)
 TEXT ·mulAddVWW(SB), NOSPLIT, $0