From baec148767328b9af7442083108466486781e835 Mon Sep 17 00:00:00 2001
From: Lynn Boger
Date: Mon, 7 Mar 2016 11:59:16 -0600
Subject: [PATCH] bytes: Equal perf improvements on ppc64le/ppc64

The existing implementation for Equal and similar functions in the
bytes package operates on one byte at a time. This performs poorly on
ppc64/ppc64le, especially when the byte buffers are large. This change
improves those functions by loading and comparing double words where
possible. The common code has been moved to a function that can be
shared by the other functions in this file which perform the same type
of comparison. Further optimizations are done for the case where >= 32
bytes are being compared. The new function memeqbody is used by
memeq_varlen, Equal, and eqstring.

When running the bytes test with -test.bench=Equal:

benchmark            old MB/s     new MB/s     speedup
BenchmarkEqual1      164.83       129.49       0.79x
BenchmarkEqual6      563.51       445.47       0.79x
BenchmarkEqual9      656.15       1099.00      1.67x
BenchmarkEqual15     591.93       1024.30      1.73x
BenchmarkEqual16     613.25       1914.12      3.12x
BenchmarkEqual20     682.37       1687.04      2.47x
BenchmarkEqual32     807.96       3843.29      4.76x
BenchmarkEqual4K     1076.25      23280.51     21.63x
BenchmarkEqual4M     1079.30      13120.14     12.16x
BenchmarkEqual64M    1073.28      10876.92     10.13x

It was determined that the degradation in the smaller byte tests was
due to unfavorable code alignment of the single byte loop.

Fixes #14368

Change-Id: I0dd87382c28887c70f4fbe80877a8ba03c31d7cd
Reviewed-on: https://go-review.googlesource.com/20249
Reviewed-by: Minux Ma
---
 src/runtime/asm_ppc64x.s | 178 +++++++++++++++++++++++----------------
 1 file changed, 106 insertions(+), 72 deletions(-)

diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index 59bc8a22dd..8d9d01b104 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -795,33 +795,13 @@ TEXT runtime·aeshash64(SB),NOSPLIT|NOFRAME,$0-0
 TEXT runtime·aeshashstr(SB),NOSPLIT|NOFRAME,$0-0
 	MOVW	(R0), R1
 
-// memequal(p, q unsafe.Pointer, size uintptr) bool
-TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
-	MOVD	a+0(FP), R3
-	MOVD	b+8(FP), R4
-	CMP	R3, R4
-	BEQ	eq
-	MOVD	size+16(FP), R5
-	SUB	$1, R3
-	SUB	$1, R4
-	ADD	R3, R5, R8
-loop:
-	CMP	R3, R8
-	BNE	test
-	MOVD	$1, R3
-	MOVB	R3, ret+24(FP)
-	RET
-test:
-	MOVBZU	1(R3), R6
-	MOVBZU	1(R4), R7
-	CMP	R6, R7
-	BEQ	loop
+TEXT runtime·memequal(SB),NOSPLIT,$0-25
+	MOVD	a+0(FP), R3
+	MOVD	b+8(FP), R4
+	MOVD	size+16(FP), R5
+
-
-	MOVB	R0, ret+24(FP)
-	RET
-eq:
-	MOVD	$1, R1
-	MOVB	R1, ret+24(FP)
+	BL	runtime·memeqbody(SB)
+	MOVB	R9, ret+24(FP)
 	RET
 
 // memequal_varlen(a, b unsafe.Pointer) bool
@@ -831,75 +811,129 @@ TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
 	CMP	R3, R4
 	BEQ	eq
 	MOVD	8(R11), R5	// compiler stores size at offset 8 in the closure
-	MOVD	R3, FIXED_FRAME+0(R1)
-	MOVD	R4, FIXED_FRAME+8(R1)
-	MOVD	R5, FIXED_FRAME+16(R1)
-	BL	runtime·memequal(SB)
-	MOVBZ	FIXED_FRAME+24(R1), R3
-	MOVB	R3, ret+16(FP)
+	BL	runtime·memeqbody(SB)
+	MOVB	R9, ret+16(FP)
 	RET
 eq:
 	MOVD	$1, R3
 	MOVB	R3, ret+16(FP)
 	RET
 
+// Do an efficient memequal for ppc64
+// for reuse where possible.
+// R3 = s1
+// R4 = s2
+// R5 = len
+// R9 = return value
+// R6, R7 clobbered
+TEXT runtime·memeqbody(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R5,CTR
+	CMP	R5,$8		// only optimize >=8
+	BLT	simplecheck
+	DCBT	(R3)		// cache hint
+	DCBT	(R4)
+	CMP	R5,$32		// optimize >= 32
+	MOVD	R5,R6		// needed if setup8a branch
+	BLT	setup8a		// 8 byte moves only
+setup32a:			// 8 byte aligned, >= 32 bytes
+	SRADCC	$5,R5,R6	// number of 32 byte chunks to compare
+	MOVD	R6,CTR
+loop32a:
+	MOVD	0(R3),R6	// doublewords to compare
+	MOVD	0(R4),R7
+	MOVD	8(R3),R8	//
+	MOVD	8(R4),R9
+	CMP	R6,R7		// bytes match?
+	BNE	noteq
+	MOVD	16(R3),R6
+	MOVD	16(R4),R7
+	CMP	R8,R9		// bytes match?
+	MOVD	24(R3),R8
+	MOVD	24(R4),R9
+	BNE	noteq
+	CMP	R6,R7		// bytes match?
+	BNE	noteq
+	ADD	$32,R3		// bump up to next 32
+	ADD	$32,R4
+	CMP	R8,R9		// bytes match?
+	BC	8,2,loop32a	// br ctr and cr
+	BNE	noteq
+	ANDCC	$24,R5,R6	// Any 8 byte chunks?
+	BEQ	leftover	// and result is 0
+setup8a:
+	SRADCC	$3,R6,R6	// get the 8 byte count
+	BEQ	leftover	// shifted value is 0
+	MOVD	R6,CTR
+loop8:
+	MOVD	0(R3),R6	// doublewords to compare
+	ADD	$8,R3
+	MOVD	0(R4),R7
+	ADD	$8,R4
+	CMP	R6,R7		// match?
+	BC	8,2,loop8	// bt ctr <> 0 && cr
+	BNE	noteq
+leftover:
+	ANDCC	$7,R5,R6	// check for leftover bytes
+	BEQ	equal
+	MOVD	R6,CTR
+	BR	simple
+simplecheck:
+	CMP	R5,$0
+	BEQ	equal
+simple:
+	MOVBZ	0(R3), R6
+	ADD	$1,R3
+	MOVBZ	0(R4), R7
+	ADD	$1,R4
+	CMP	R6, R7
+	BNE	noteq
+	BC	8,2,simple
+	BNE	noteq
+	BR	equal
+noteq:
+	MOVD	$0, R9
+	RET
+equal:
+	MOVD	$1, R9
+	RET
+
 // eqstring tests whether two strings are equal.
 // The compiler guarantees that strings passed
 // to eqstring have equal length.
 // See runtime_test.go:eqstring_generic for
 // equivalent Go code.
 TEXT runtime·eqstring(SB),NOSPLIT,$0-33
-	MOVD	s1str+0(FP), R3
-	MOVD	s2str+16(FP), R4
-	MOVD	$1, R5
-	MOVB	R5, ret+32(FP)
-	CMP	R3, R4
-	BNE	2(PC)
+	MOVD	s1str+0(FP), R3
+	MOVD	s2str+16(FP), R4
+	MOVD	$1, R5
+	MOVB	R5, ret+32(FP)
+	CMP	R3, R4
+	BNE	2(PC)
 	RET
-	MOVD	s1len+8(FP), R5
-	SUB	$1, R3
-	SUB	$1, R4
-	ADD	R3, R5, R8
-loop:
-	CMP	R3, R8
-	BNE	2(PC)
-	RET
-	MOVBZU	1(R3), R6
-	MOVBZU	1(R4), R7
-	CMP	R6, R7
-	BEQ	loop
-	MOVB	R0, ret+32(FP)
+	MOVD	s1len+8(FP), R5
+	BL	runtime·memeqbody(SB)
+	MOVB	R9, ret+32(FP)
 	RET
 
-// TODO: share code with memequal?
 TEXT bytes·Equal(SB),NOSPLIT,$0-49
-	MOVD	a_len+8(FP), R3
-	MOVD	b_len+32(FP), R4
-
-	CMP	R3, R4		// unequal lengths are not equal
+	MOVD	a_len+8(FP), R4
+	MOVD	b_len+32(FP), R5
+	CMP	R5, R4		// unequal lengths are not equal
 	BNE	noteq
+	MOVD	a+0(FP), R3
+	MOVD	b+24(FP), R4
+	BL	runtime·memeqbody(SB)
-	MOVD	a+0(FP), R5
-	MOVD	b+24(FP), R6
-	SUB	$1, R5
-	SUB	$1, R6
-	ADD	R5, R3		// end-1
-
-loop:
-	CMP	R5, R3
-	BEQ	equal		// reached the end
-	MOVBZU	1(R5), R4
-	MOVBZU	1(R6), R7
-	CMP	R4, R7
-	BEQ	loop
+	MOVBZ	R9,ret+48(FP)
+	RET
 noteq:
-	MOVBZ	R0, ret+48(FP)
+	MOVBZ	$0,ret+48(FP)
 	RET
 equal:
-	MOVD	$1, R3
-	MOVBZ	R3, ret+48(FP)
+	MOVD	$1,R3
+	MOVBZ	R3,ret+48(FP)
 	RET
 
 TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
-- 
2.48.1
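
For readers who do not follow ppc64 assembly, below is a pure-Go sketch of the
comparison strategy the commit message describes: compare 32-byte chunks as four
doublewords, then any remaining doublewords, then the leftover bytes one at a
time. This is only an illustration, not part of the patch; the function name
memeqSketch and the use of encoding/binary are assumptions made here, and the
real memeqbody is runtime assembly that works on raw pointers and relies on its
callers for the length check.

package main

import (
	"encoding/binary"
	"fmt"
)

// memeqSketch reports whether a and b hold the same bytes, using the same
// chunking as memeqbody: 32-byte blocks compared as four 64-bit words, then
// single 64-bit words, then a byte-at-a-time tail. Byte order does not matter
// for an equality test, so LittleEndian is just a convenient 8-byte load.
func memeqSketch(a, b []byte) bool {
	if len(a) != len(b) {
		return false
	}
	i, n := 0, len(a)
	for ; n-i >= 32; i += 32 { // four doublewords per iteration
		if binary.LittleEndian.Uint64(a[i:]) != binary.LittleEndian.Uint64(b[i:]) ||
			binary.LittleEndian.Uint64(a[i+8:]) != binary.LittleEndian.Uint64(b[i+8:]) ||
			binary.LittleEndian.Uint64(a[i+16:]) != binary.LittleEndian.Uint64(b[i+16:]) ||
			binary.LittleEndian.Uint64(a[i+24:]) != binary.LittleEndian.Uint64(b[i+24:]) {
			return false
		}
	}
	for ; n-i >= 8; i += 8 { // leftover doublewords
		if binary.LittleEndian.Uint64(a[i:]) != binary.LittleEndian.Uint64(b[i:]) {
			return false
		}
	}
	for ; i < n; i++ { // leftover bytes
		if a[i] != b[i] {
			return false
		}
	}
	return true
}

func main() {
	a := []byte("the quick brown fox jumps over the lazy dog")
	b := []byte("the quick brown fox jumps over the lazy dog")
	fmt.Println(memeqSketch(a, b)) // true
	b[len(b)-1] = '!'
	fmt.Println(memeqSketch(a, b)) // false
}

The unrolled 32-byte path is what drives the large gains in the 4K/4M/64M rows
of the table above: loop and branch overhead is paid once per 32 bytes instead
of once per byte.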
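
The MB/s figures in the commit message come from throughput benchmarks in the
bytes package. The snippet below is a minimal stand-alone benchmark of the same
shape, not the actual bytes_test.go code; the package name, file name
(for example bench_test.go), and the 4 KiB size are assumptions for
illustration. testing.B.SetBytes is what makes go test report MB/s.

package bench

import (
	"bytes"
	"testing"
)

// BenchmarkEqual4K measures bytes.Equal throughput on 4 KiB buffers.
// SetBytes records how many bytes each iteration processes, so the
// benchmark output includes an MB/s column.
func BenchmarkEqual4K(b *testing.B) {
	x := make([]byte, 4096)
	y := make([]byte, 4096)
	b.SetBytes(int64(len(x)))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		if !bytes.Equal(x, y) {
			b.Fatal("buffers should compare equal")
		}
	}
}

Run it with: go test -bench=Equal .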