From 9459c03b29937d236a8b61e452cb02d01c7b8559 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Seo Date: Tue, 18 Apr 2017 18:20:56 -0300 Subject: [PATCH] math/big: improve performance for addVV/subVV for ppc64x This change adds a better asm implementation of addVV for ppc64x, with speedups up to nearly 3x in the best cases. benchmark old ns/op new ns/op delta BenchmarkAddVV/1-8 7.33 5.81 -20.74% BenchmarkAddVV/2-8 8.72 6.49 -25.57% BenchmarkAddVV/3-8 10.5 7.08 -32.57% BenchmarkAddVV/4-8 12.7 7.57 -40.39% BenchmarkAddVV/5-8 14.3 8.06 -43.64% BenchmarkAddVV/10-8 27.6 11.1 -59.78% BenchmarkAddVV/100-8 218 82.4 -62.20% BenchmarkAddVV/1000-8 2064 718 -65.21% BenchmarkAddVV/10000-8 20536 7153 -65.17% BenchmarkAddVV/100000-8 211004 72403 -65.69% benchmark old MB/s new MB/s speedup BenchmarkAddVV/1-8 8729.74 11006.26 1.26x BenchmarkAddVV/2-8 14683.65 19707.55 1.34x BenchmarkAddVV/3-8 18226.96 27103.63 1.49x BenchmarkAddVV/4-8 20204.50 33805.81 1.67x BenchmarkAddVV/5-8 22348.64 39694.06 1.78x BenchmarkAddVV/10-8 23212.74 57631.08 2.48x BenchmarkAddVV/100-8 29300.07 77629.53 2.65x BenchmarkAddVV/1000-8 31000.56 89094.54 2.87x BenchmarkAddVV/10000-8 31163.61 89469.16 2.87x BenchmarkAddVV/100000-8 30331.16 88393.73 2.91x It also adds the use of CTR for the loop counter in subVV, instead of manually updating the loop counter. This is slightly faster. Change-Id: Ic4b05cad384fd057972d46a5618ed5c3039d7460 Reviewed-on: https://go-review.googlesource.com/41010 Run-TryBot: Lynn Boger TryBot-Result: Gobot Gobot Reviewed-by: Lynn Boger --- src/math/big/arith_ppc64x.s | 49 ++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s index ba4e4ab63a..5ed3de68e2 100644 --- a/src/math/big/arith_ppc64x.s +++ b/src/math/big/arith_ppc64x.s @@ -19,8 +19,35 @@ TEXT ·mulWW(SB), NOSPLIT, $0 MOVD R7, z0+24(FP) RET +// func addVV(z, y, y []Word) (c Word) +// z[i] = x[i] + y[i] for all i, carrying TEXT ·addVV(SB), NOSPLIT, $0 - BR ·addVV_g(SB) + MOVD z_len+8(FP), R7 + MOVD x+24(FP), R8 + MOVD y+48(FP), R9 + MOVD z+0(FP), R10 + + MOVD R0, R4 + MOVD R0, R6 // R6 will be the address index + ADDC R4, R4 // clear CA + MOVD R7, CTR + + CMP R0, R7 + BEQ done + +loop: + MOVD (R8)(R6), R11 // x[i] + MOVD (R9)(R6), R12 // y[i] + ADDE R12, R11, R15 // x[i] + y[i] + CA + MOVD R15, (R10)(R6) // z[i] + + ADD $8, R6 + BC 16, 0, loop // bdnz + +done: + ADDZE R4 + MOVD R4, c+72(FP) + RET // func subVV(z, x, y []Word) (c Word) // z[i] = x[i] - y[i] for all i, carrying @@ -30,32 +57,30 @@ TEXT ·subVV(SB), NOSPLIT, $0 MOVD y+48(FP), R9 MOVD z+0(FP), R10 - MOVD $0, R4 // c = 0 - MOVD $0, R5 // i = 0 - MOVD $1, R29 // work around lack of ADDI - MOVD $8, R28 // work around lack of scaled addressing - + MOVD R0, R4 // c = 0 + MOVD R0, R6 SUBC R0, R0 // clear CA - JMP sublend + MOVD R7, CTR + + CMP R0, R7 + BEQ sublend // amd64 saves and restores CF, but I believe they only have to do that because all of // their math operations clobber it - we should just be able to recover it at the end. subloop: - MULLD R5, R28, R6 MOVD (R8)(R6), R11 // x[i] MOVD (R9)(R6), R12 // y[i] SUBE R12, R11, R15 MOVD R15, (R10)(R6) - ADD R29, R5 // i++ + ADD $8, R6 + BC 16, 0, subloop // bdnz sublend: - CMP R5, R7 - BLT subloop ADDZE R4 - XOR R29, R4 + XOR $1, R4 MOVD R4, c+72(FP) RET -- 2.48.1