From fc8967e3844431665169bfae001f0de454be5bb2 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Seo Date: Tue, 27 Mar 2018 23:38:32 -0400 Subject: [PATCH] math/big: improve performance on ppc64x by unrolling loops This change improves performance of addVV, subVV and mulAddVWW by unrolling the loops, with improvements up to 1.45x. benchmark old ns/op new ns/op delta BenchmarkAddVV/1-16 5.79 5.85 +1.04% BenchmarkAddVV/2-16 6.41 6.62 +3.28% BenchmarkAddVV/3-16 6.89 7.35 +6.68% BenchmarkAddVV/4-16 7.47 8.26 +10.58% BenchmarkAddVV/5-16 8.04 8.18 +1.74% BenchmarkAddVV/10-16 10.9 11.2 +2.75% BenchmarkAddVV/100-16 81.7 57.0 -30.23% BenchmarkAddVV/1000-16 714 500 -29.97% BenchmarkAddVV/10000-16 7088 4946 -30.22% BenchmarkAddVV/100000-16 71514 49364 -30.97% BenchmarkSubVV/1-16 5.94 5.89 -0.84% BenchmarkSubVV/2-16 12.9 6.82 -47.13% BenchmarkSubVV/3-16 7.03 7.34 +4.41% BenchmarkSubVV/4-16 7.58 8.23 +8.58% BenchmarkSubVV/5-16 8.15 8.19 +0.49% BenchmarkSubVV/10-16 11.2 11.4 +1.79% BenchmarkSubVV/100-16 82.4 57.0 -30.83% BenchmarkSubVV/1000-16 715 499 -30.21% BenchmarkSubVV/10000-16 7089 4947 -30.22% BenchmarkSubVV/100000-16 71568 49378 -31.01% benchmark old MB/s new MB/s speedup BenchmarkAddVV/1-16 11048.49 10939.92 0.99x BenchmarkAddVV/2-16 19973.41 19323.60 0.97x BenchmarkAddVV/3-16 27847.09 26123.06 0.94x BenchmarkAddVV/4-16 34276.46 30976.54 0.90x BenchmarkAddVV/5-16 39781.92 39140.68 0.98x BenchmarkAddVV/10-16 58559.29 56894.68 0.97x BenchmarkAddVV/100-16 78354.88 112243.69 1.43x BenchmarkAddVV/1000-16 89592.74 127889.04 1.43x BenchmarkAddVV/10000-16 90292.39 129387.06 1.43x BenchmarkAddVV/100000-16 89492.92 129647.78 1.45x BenchmarkSubVV/1-16 10781.03 10861.22 1.01x BenchmarkSubVV/2-16 9949.27 18760.21 1.89x BenchmarkSubVV/3-16 27319.40 26166.01 0.96x BenchmarkSubVV/4-16 33764.35 31123.02 0.92x BenchmarkSubVV/5-16 39272.40 39050.31 0.99x BenchmarkSubVV/10-16 57262.87 56206.33 0.98x BenchmarkSubVV/100-16 77641.78 112280.86 1.45x BenchmarkSubVV/1000-16 89486.27 128064.08 1.43x BenchmarkSubVV/10000-16 90274.37 129356.59 1.43x BenchmarkSubVV/100000-16 89424.42 129610.50 1.45x Change-Id: I2795a82134d1e3b75e2634c76b8ca165a723ec7b Reviewed-on: https://go-review.googlesource.com/103495 Run-TryBot: Lynn Boger TryBot-Result: Gobot Gobot Reviewed-by: Lynn Boger --- src/math/big/arith_ppc64x.s | 293 +++++++++++++++++++++++++++++------- 1 file changed, 237 insertions(+), 56 deletions(-) diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s index b3ac91e35e..dbb168a376 100644 --- a/src/math/big/arith_ppc64x.s +++ b/src/math/big/arith_ppc64x.s @@ -22,65 +22,175 @@ TEXT ·mulWW(SB), NOSPLIT, $0 // func addVV(z, y, y []Word) (c Word) // z[i] = x[i] + y[i] for all i, carrying TEXT ·addVV(SB), NOSPLIT, $0 - MOVD z_len+8(FP), R7 - MOVD x+24(FP), R8 - MOVD y+48(FP), R9 - MOVD z+0(FP), R10 - - MOVD R0, R4 - MOVD R0, R6 // R6 will be the address index - ADDC R4, R4 // clear CA - MOVD R7, CTR + MOVD z_len+8(FP), R7 // R7 = z_len + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R9 // R9 = y[] + MOVD z+0(FP), R10 // R10 = z[] + // If z_len = 0, we are done CMP R0, R7 + MOVD R0, R4 BEQ done + // Process the first iteration out of the loop so we can + // use MOVDU and avoid 3 index registers updates. + MOVD 0(R8), R11 // R11 = x[i] + MOVD 0(R9), R12 // R12 = y[i] + ADD $-1, R7 // R7 = z_len - 1 + ADDC R12, R11, R15 // R15 = x[i] + y[i], set CA + CMP R0, R7 + MOVD R15, 0(R10) // z[i] + BEQ final // If z_len was 1, we are done + + SRD $2, R7, R5 // R5 = z_len/4 + CMP R0, R5 + MOVD R5, CTR // Set up loop counter + BEQ tail // If R5 = 0, we can't use the loop + + // Process 4 elements per iteration. Unrolling this loop + // means a performance trade-off: we will lose performance + // for small values of z_len (0.90x in the worst case), but + // gain significant performance as z_len increases (up to + // 1.45x). loop: - MOVD (R8)(R6), R11 // x[i] - MOVD (R9)(R6), R12 // y[i] - ADDE R12, R11, R15 // x[i] + y[i] + CA - MOVD R15, (R10)(R6) // z[i] + MOVD 8(R8), R11 // R11 = x[i] + MOVD 16(R8), R12 // R12 = x[i+1] + MOVD 24(R8), R14 // R14 = x[i+2] + MOVDU 32(R8), R15 // R15 = x[i+3] + MOVD 8(R9), R16 // R16 = y[i] + MOVD 16(R9), R17 // R17 = y[i+1] + MOVD 24(R9), R18 // R18 = y[i+2] + MOVDU 32(R9), R19 // R19 = y[i+3] + ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA + ADDE R12, R17, R21 // R21 = x[i+1] + y[i+1] + CA + ADDE R14, R18, R22 // R22 = x[i+2] + y[i+2] + CA + ADDE R15, R19, R23 // R23 = x[i+3] + y[i+3] + CA + MOVD R20, 8(R10) // z[i] + MOVD R21, 16(R10) // z[i+1] + MOVD R22, 24(R10) // z[i+2] + MOVDU R23, 32(R10) // z[i+3] + ADD $-4, R7 // R7 = z_len - 4 + BC 16, 0, loop // bdnz + + // We may have more elements to read + CMP R0, R7 + BEQ final - ADD $8, R6 - BC 16, 0, loop // bdnz + // Process the remaining elements, one at a time +tail: + MOVDU 8(R8), R11 // R11 = x[i] + MOVDU 8(R9), R16 // R16 = y[i] + ADD $-1, R7 // R7 = z_len - 1 + ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA + CMP R0, R7 + MOVDU R20, 8(R10) // z[i] + BEQ final // If R7 = 0, we are done + + MOVDU 8(R8), R11 + MOVDU 8(R9), R16 + ADD $-1, R7 + ADDE R11, R16, R20 + CMP R0, R7 + MOVDU R20, 8(R10) + BEQ final + + MOVD 8(R8), R11 + MOVD 8(R9), R16 + ADDE R11, R16, R20 + MOVD R20, 8(R10) + +final: + ADDZE R4 // Capture CA done: - ADDZE R4 MOVD R4, c+72(FP) RET // func subVV(z, x, y []Word) (c Word) // z[i] = x[i] - y[i] for all i, carrying TEXT ·subVV(SB), NOSPLIT, $0 - MOVD z_len+8(FP), R7 - MOVD x+24(FP), R8 - MOVD y+48(FP), R9 - MOVD z+0(FP), R10 - - MOVD R0, R4 // c = 0 - MOVD R0, R6 - SUBC R0, R0 // clear CA - MOVD R7, CTR + MOVD z_len+8(FP), R7 // R7 = z_len + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R9 // R9 = y[] + MOVD z+0(FP), R10 // R10 = z[] - CMP R0, R7 - BEQ sublend + // If z_len = 0, we are done + CMP R0, R7 + MOVD R0, R4 + BEQ done -// amd64 saves and restores CF, but I believe they only have to do that because all of -// their math operations clobber it - we should just be able to recover it at the end. -subloop: - MOVD (R8)(R6), R11 // x[i] - MOVD (R9)(R6), R12 // y[i] + // Process the first iteration out of the loop so we can + // use MOVDU and avoid 3 index registers updates. + MOVD 0(R8), R11 // R11 = x[i] + MOVD 0(R9), R12 // R12 = y[i] + ADD $-1, R7 // R7 = z_len - 1 + SUBC R12, R11, R15 // R15 = x[i] - y[i], set CA + CMP R0, R7 + MOVD R15, 0(R10) // z[i] + BEQ final // If z_len was 1, we are done + + SRD $2, R7, R5 // R5 = z_len/4 + CMP R0, R5 + MOVD R5, CTR // Set up loop counter + BEQ tail // If R5 = 0, we can't use the loop + + // Process 4 elements per iteration. Unrolling this loop + // means a performance trade-off: we will lose performance + // for small values of z_len (0.92x in the worst case), but + // gain significant performance as z_len increases (up to + // 1.45x). +loop: + MOVD 8(R8), R11 // R11 = x[i] + MOVD 16(R8), R12 // R12 = x[i+1] + MOVD 24(R8), R14 // R14 = x[i+2] + MOVDU 32(R8), R15 // R15 = x[i+3] + MOVD 8(R9), R16 // R16 = y[i] + MOVD 16(R9), R17 // R17 = y[i+1] + MOVD 24(R9), R18 // R18 = y[i+2] + MOVDU 32(R9), R19 // R19 = y[i+3] + SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA + SUBE R17, R12, R21 // R21 = x[i+1] - y[i+1] + CA + SUBE R18, R14, R22 // R22 = x[i+2] - y[i+2] + CA + SUBE R19, R15, R23 // R23 = x[i+3] - y[i+3] + CA + MOVD R20, 8(R10) // z[i] + MOVD R21, 16(R10) // z[i+1] + MOVD R22, 24(R10) // z[i+2] + MOVDU R23, 32(R10) // z[i+3] + ADD $-4, R7 // R7 = z_len - 4 + BC 16, 0, loop // bdnz + + // We may have more elements to read + CMP R0, R7 + BEQ final - SUBE R12, R11, R15 - MOVD R15, (R10)(R6) + // Process the remaining elements, one at a time +tail: + MOVDU 8(R8), R11 // R11 = x[i] + MOVDU 8(R9), R16 // R16 = y[i] + ADD $-1, R7 // R7 = z_len - 1 + SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA + CMP R0, R7 + MOVDU R20, 8(R10) // z[i] + BEQ final // If R7 = 0, we are done - ADD $8, R6 - BC 16, 0, subloop // bdnz + MOVDU 8(R8), R11 + MOVDU 8(R9), R16 + ADD $-1, R7 + SUBE R16, R11, R20 + CMP R0, R7 + MOVDU R20, 8(R10) + BEQ final -sublend: + MOVD 8(R8), R11 + MOVD 8(R9), R16 + SUBE R16, R11, R20 + MOVD R20, 8(R10) +final: ADDZE R4 XOR $1, R4 + +done: MOVD R4, c+72(FP) RET @@ -242,30 +352,101 @@ TEXT ·shrVU(SB), NOSPLIT, $0 // func mulAddVWW(z, x []Word, y, r Word) (c Word) TEXT ·mulAddVWW(SB), NOSPLIT, $0 - MOVD z+0(FP), R10 // R10 = z[] - MOVD x+24(FP), R8 // R8 = x[] - MOVD y+48(FP), R9 // R9 = y - MOVD r+56(FP), R4 // R4 = r = c - MOVD z_len+8(FP), R11 // R11 = z_len + MOVD z+0(FP), R10 // R10 = z[] + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R9 // R9 = y + MOVD r+56(FP), R4 // R4 = r = c + MOVD z_len+8(FP), R11 // R11 = z_len + + CMP R0, R11 + BEQ done + + MOVD 0(R8), R20 + ADD $-1, R11 + MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y) + MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y) + ADDC R4, R6 // R6 = z0 + r + ADDZE R7 // R7 = z1 + CA + CMP R0, R11 + MOVD R7, R4 // R4 = c + MOVD R6, 0(R10) // z[i] + BEQ done - MOVD R0, R3 // R3 will be the index register - CMP R0, R11 - MOVD R11, CTR // Initialize loop counter - BEQ done + // We will read 4 elements per iteration + SRD $2, R11, R14 // R14 = z_len/4 + DCBT (R8) + CMP R0, R14 + MOVD R14, CTR // Set up the loop counter + BEQ tail // If R9 = 0, we can't use the loop loop: - MOVD (R8)(R3), R20 // x[i] - MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y) - MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y) - ADDC R4, R6 // Compute sum for z1 and z0 - ADDZE R7 - MOVD R6, (R10)(R3) // z[i] - MOVD R7, R4 // c - ADD $8, R3 - BC 16, 0, loop // bdnz + MOVD 8(R8), R20 // R20 = x[i] + MOVD 16(R8), R21 // R21 = x[i+1] + MOVD 24(R8), R22 // R22 = x[i+2] + MOVDU 32(R8), R23 // R23 = x[i+3] + MULLD R9, R20, R24 // R24 = z0[i] + MULHDU R9, R20, R20 // R20 = z1[i] + ADDC R4, R24 // R24 = z0[i] + c + ADDZE R20 // R7 = z1[i] + CA + MULLD R9, R21, R25 + MULHDU R9, R21, R21 + ADDC R20, R25 + ADDZE R21 + MULLD R9, R22, R26 + MULHDU R9, R22, R22 + ADDC R21, R26 + ADDZE R22 + MULLD R9, R23, R27 + MULHDU R9, R23, R23 + ADDC R22, R27 + ADDZE R23 + MOVD R24, 8(R10) // z[i] + MOVD R25, 16(R10) // z[i+1] + MOVD R26, 24(R10) // z[i+2] + MOVDU R27, 32(R10) // z[i+3] + MOVD R23, R4 // R4 = c + ADD $-4, R11 // R11 = z_len - 4 + BC 16, 0, loop // bdnz + + // We may have some elements to read + CMP R0, R11 + BEQ done + + // Process the remaining elements, one at a time +tail: + MOVDU 8(R8), R20 // R20 = x[i] + MULLD R9, R20, R24 // R24 = z0[i] + MULHDU R9, R20, R25 // R25 = z1[i] + ADD $-1, R11 // R11 = z_len - 1 + ADDC R4, R24 + ADDZE R25 + MOVDU R24, 8(R10) // z[i] + CMP R0, R11 + MOVD R25, R4 // R4 = c + BEQ done // If R11 = 0, we are done + + MOVDU 8(R8), R20 + MULLD R9, R20, R24 + MULHDU R9, R20, R25 + ADD $-1, R11 + ADDC R4, R24 + ADDZE R25 + MOVDU R24, 8(R10) + CMP R0, R11 + MOVD R25, R4 + BEQ done + + MOVD 8(R8), R20 + MULLD R9, R20, R24 + MULHDU R9, R20, R25 + ADD $-1, R11 + ADDC R4, R24 + ADDZE R25 + MOVD R24, 8(R10) + MOVD R25, R4 done: - MOVD R4, c+64(FP) + MOVD R4, c+64(FP) RET // func addMulVVW(z, x []Word, y Word) (c Word) -- 2.50.0