From 1e28dce80ad2ec195d55269266c5cca7ebd845a5 Mon Sep 17 00:00:00 2001 From: Lynn Boger Date: Thu, 13 Oct 2016 12:59:07 -0500 Subject: [PATCH] bytes: improve performance for bytes.Compare on ppc64x This improves the performance for bytes.Compare by rewriting the cmpbody function in runtime/asm_ppc64x.s. The previous code had a simple loop which loaded a pair of bytes and compared them, which is inefficient for long buffers. The updated function checks for 8 or 32 byte chunks and then loads and compares doublewords where possible. Because the bytes.Compare result indicates greater or less than, the doubleword loads must take endianness into account, using a byte reversed load in the little endian case. Fixes #17433 benchmark old ns/op new ns/op delta BenchmarkBytesCompare/8-16 13.6 7.16 -47.35% BenchmarkBytesCompare/16-16 25.7 7.83 -69.53% BenchmarkBytesCompare/32-16 38.1 7.78 -79.58% BenchmarkBytesCompare/64-16 63.0 10.6 -83.17% BenchmarkBytesCompare/128-16 112 13.0 -88.39% BenchmarkBytesCompare/256-16 211 28.1 -86.68% BenchmarkBytesCompare/512-16 410 38.6 -90.59% BenchmarkBytesCompare/1024-16 807 60.2 -92.54% BenchmarkBytesCompare/2048-16 1601 103 -93.57% Change-Id: I121acc74fcd27c430797647b8d682eb0607c63eb Reviewed-on: https://go-review.googlesource.com/30949 Reviewed-by: David Chase --- src/cmd/internal/obj/ppc64/a.out.go | 1 + src/cmd/internal/obj/ppc64/anames.go | 1 + src/cmd/internal/obj/ppc64/asm9.go | 3 + src/runtime/asm_ppc64x.s | 268 ++++++++++++++++++++++----- 4 files changed, 225 insertions(+), 48 deletions(-) diff --git a/src/cmd/internal/obj/ppc64/a.out.go b/src/cmd/internal/obj/ppc64/a.out.go index 556ea37608..ac1126e8f2 100644 --- a/src/cmd/internal/obj/ppc64/a.out.go +++ b/src/cmd/internal/obj/ppc64/a.out.go @@ -424,6 +424,7 @@ const ( ALSW ALWAR ALWSYNC + AMOVDBR AMOVWBR AMOVB AMOVBU diff --git a/src/cmd/internal/obj/ppc64/anames.go b/src/cmd/internal/obj/ppc64/anames.go index 1d766a2d6b..9e26666dbe 100644 --- a/src/cmd/internal/obj/ppc64/anames.go 
+++ b/src/cmd/internal/obj/ppc64/anames.go @@ -125,6 +125,7 @@ var Anames = []string{ "LSW", "LWAR", "LWSYNC", + "MOVDBR", "MOVWBR", "MOVB", "MOVBU", diff --git a/src/cmd/internal/obj/ppc64/asm9.go b/src/cmd/internal/obj/ppc64/asm9.go index d36e32ff0d..7c9d83a7d1 100644 --- a/src/cmd/internal/obj/ppc64/asm9.go +++ b/src/cmd/internal/obj/ppc64/asm9.go @@ -1513,6 +1513,7 @@ func buildop(ctxt *obj.Link) { case AMOVHBR: opset(AMOVWBR, r0) + opset(AMOVDBR, r0) case ASLBMFEE: opset(ASLBMFEV, r0) @@ -3923,6 +3924,8 @@ func oploadx(ctxt *obj.Link, a obj.As) uint32 { return OPVCC(31, 790, 0, 0) /* lhbrx */ case AMOVWBR: return OPVCC(31, 534, 0, 0) /* lwbrx */ + case AMOVDBR: + return OPVCC(31, 532, 0, 0) /* ldbrx */ case AMOVHZ: return OPVCC(31, 279, 0, 0) /* lhzx */ case AMOVHZU: diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s index b5cd12bb3c..8b5ea45082 100644 --- a/src/runtime/asm_ppc64x.s +++ b/src/runtime/asm_ppc64x.s @@ -812,13 +812,220 @@ eq: MOVB R3, ret+16(FP) RET -// Do an efficieint memequal for ppc64 -// for reuse where possible. +// Do an efficient memcmp for ppc64le +// R3 = s1 len +// R4 = s2 len +// R5 = s1 addr +// R6 = s2 addr +// R7 = addr of return value (-1, 0 or 1 is stored there) +TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0 + MOVD R3,R8 // assume s1 len is the compare length + CMP R3,R4,CR2 // compare lengths; CR2 is kept for the final tie-break + BC 12,8,setuplen // BLT CR2: s1 len < s2 len, keep R3 + MOVD R4,R8 // use R4 for comparison len +setuplen: + MOVD R8,CTR // set up loop counter + CMP R8,$8 // only optimize >=8 + BLT simplecheck + DCBT (R5) // cache hint + DCBT (R6) + CMP R8,$32 // optimize >= 32 + MOVD R8,R9 // R9 = compare length, consumed at setup8a + BLT setup8a // 8 byte moves only +setup32a: + SRADCC $5,R8,R9 // number of 32 byte chunks + MOVD R9,CTR + + // Special processing for 32 bytes or longer. + // Loading this way is faster and correct as long as the + // doublewords being compared are equal. Once they + // are found unequal, reload them in proper byte order + // to determine greater or less than. 
+loop32a: + MOVD 0(R5),R9 // doublewords to compare + MOVD 0(R6),R10 // get 4 doublewords + MOVD 8(R5),R14 + MOVD 8(R6),R15 + CMPU R9,R10 // 1st pair (offset 0) equal? + MOVD $0,R16 // cmpne reload offset = 0 + BNE cmpne // further compare for LT or GT + MOVD 16(R5),R9 // get next pair of doublewords + MOVD 16(R6),R10 + CMPU R14,R15 // 2nd pair (offset 8) match? + MOVD $8,R16 // cmpne reload offset = 8 + BNE cmpne // further compare for LT or GT + MOVD 24(R5),R14 // get next pair of doublewords + MOVD 24(R6),R15 + CMPU R9,R10 // 3rd pair (offset 16) match? + MOVD $16,R16 // cmpne reload offset = 16 + BNE cmpne // further compare for LT or GT + MOVD $-8,R16 // for cmpne: 4th pair is at -8 once R5,R6 are inc by 32 + ADD $32,R5 // bump up to next 32 + ADD $32,R6 + CMPU R14,R15 // 4th pair (offset 24) match? + BC 8,2,loop32a // dec ctr; loop while ctr <> 0 && equal + BNE cmpne + ANDCC $24,R8,R9 // Any 8 byte chunks? + BEQ leftover // and result is 0 +setup8a: + SRADCC $3,R9,R9 // get the 8 byte count + BEQ leftover // shifted value is 0 + MOVD R9,CTR // loop count for doublewords +loop8: + MOVDBR (R5+R0),R9 // doublewords to compare + MOVDBR (R6+R0),R10 // LE compare order + ADD $8,R5 + ADD $8,R6 + CMPU R9,R10 // match? 
+ BC 8,2,loop8 // bt ctr <> 0 && cr + BGT greater + BLT less +leftover: + ANDCC $7,R8,R9 // check for leftover bytes + MOVD R9,CTR // save the ctr + BNE simple // leftover bytes + BC 12,10,equal // test CR2 for length comparison + BC 12,8,less + BR greater +simplecheck: + CMP R8,$0 // remaining compare length 0 + BNE simple // do simple compare + BC 12,10,equal // test CR2 for length comparison + BC 12,8,less // 1st len < 2nd len, result less + BR greater // 1st len > 2nd len must be greater +simple: + MOVBZ 0(R5), R9 // get byte from 1st operand + ADD $1,R5 + MOVBZ 0(R6), R10 // get byte from 2nd operand + ADD $1,R6 + CMPU R9, R10 + BC 8,2,simple // bc ctr <> 0 && cr + BGT greater // 1st > 2nd + BLT less // 1st < 2nd + BC 12,10,equal // test CR2 for length comparison + BC 12,9,greater // 1st len > 2nd len + BR less // must be less +cmpne: // only here if not equal + MOVDBR (R5+R16),R8 // reload in reverse order + MOVDBR (R6+R16),R9 + CMPU R8,R9 // compare correct endianness + BGT greater // here only if NE +less: + MOVD $-1,R3 + MOVD R3,(R7) // return value if A < B + RET +equal: + MOVD $0,(R7) // return value if A == B + RET +greater: + MOVD $1,R3 + MOVD R3,(R7) // return value if A > B + RET + +// Do an efficient memcmp for ppc64 (BE) +// R3 = s1 len +// R4 = s2 len +// R5 = s1 addr +// R6 = s2 addr +// R7 = addr of return value (-1, 0 or 1 is stored there) +TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0 + MOVD R3,R8 // assume s1 len is the compare length + CMP R3,R4,CR2 // compare lengths; CR2 is kept for the final tie-break + BC 12,8,setuplen // BLT CR2: s1 len < s2 len, keep R3 + MOVD R4,R8 // use R4 for comparison len +setuplen: + MOVD R8,CTR // set up loop counter + CMP R8,$8 // only optimize >=8 + BLT simplecheck + DCBT (R5) // cache hint + DCBT (R6) + CMP R8,$32 // optimize >= 32 + MOVD R8,R9 // R9 = compare length, consumed at setup8a + BLT setup8a // 8 byte moves only + +setup32a: + SRADCC $5,R8,R9 // number of 32 byte chunks + MOVD R9,CTR +loop32a: + MOVD 0(R5),R9 // doublewords to compare + MOVD 0(R6),R10 // get 4 doublewords + MOVD 8(R5),R14 + MOVD 8(R6),R15 + CMPU R9,R10 // bytes equal? 
+ BLT less // found to be less + BGT greater // found to be greater + MOVD 16(R5),R9 // get next pair of doublewords + MOVD 16(R6),R10 + CMPU R14,R15 // bytes match? + BLT less // found less + BGT greater // found greater + MOVD 24(R5),R14 // get next pair of doublewords + MOVD 24(R6),R15 + CMPU R9,R10 // bytes match? + BLT less // found to be less + BGT greater // found to be greater + ADD $32,R5 // bump up to next 32 + ADD $32,R6 + CMPU R14,R15 // bytes match? + BC 8,2,loop32a // br ctr and cr + BLT less // with BE, byte ordering is + BGT greater // good for compare + ANDCC $24,R8,R9 // Any 8 byte chunks? + BEQ leftover // and result is 0 +setup8a: + SRADCC $3,R9,R9 // get the 8 byte count + BEQ leftover // shifted value is 0 + MOVD R9,CTR // loop count for doublewords +loop8: + MOVD (R5),R9 // doublewords to compare + MOVD (R6),R10 // BE: plain load, no byte reversal + ADD $8,R5 + ADD $8,R6 + CMPU R9,R10 // match? + BC 8,2,loop8 // bt ctr <> 0 && cr + BGT greater + BLT less +leftover: + ANDCC $7,R8,R9 // check for leftover bytes + MOVD R9,CTR // save the ctr + BNE simple // leftover bytes + BC 12,10,equal // test CR2 for length comparison + BC 12,8,less + BR greater +simplecheck: + CMP R8,$0 // remaining compare length 0 + BNE simple // do simple compare + BC 12,10,equal // test CR2 for length comparison + BC 12,8,less // 1st len < 2nd len, result less + BR greater // 1st len > 2nd len must be greater +simple: + MOVBZ 0(R5),R9 // get byte from 1st operand + ADD $1,R5 + MOVBZ 0(R6),R10 // get byte from 2nd operand + ADD $1,R6 + CMPU R9,R10 + BC 8,2,simple // bc ctr <> 0 && cr + BGT greater // 1st > 2nd + BLT less // 1st < 2nd + BC 12,10,equal // test CR2 for length comparison + BC 12,9,greater // 1st len > 2nd len; otherwise fall through to less +less: + MOVD $-1,R3 + MOVD R3,(R7) // return value if A < B + RET +equal: + MOVD $0,(R7) // return value if A == B + RET +greater: + MOVD $1,R3 + MOVD R3,(R7) // return value if A > B + RET + +// Do an efficient memequal for ppc64 // R3 = s1 // R4 = s2 // R5 = len // R9 = return value -// R6, R7 clobbered TEXT 
runtime·memeqbody(SB),NOSPLIT|NOFRAME,$0-0 MOVD R5,CTR CMP R5,$8 // only optimize >=8 @@ -983,7 +1190,11 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 MOVD s2_base+16(FP), R6 MOVD s2_len+24(FP), R4 MOVD $ret+32(FP), R7 - BR runtime·cmpbody<>(SB) +#ifdef GOARCH_ppc64le + BR cmpbodyLE<>(SB) +#else + BR cmpbodyBE<>(SB) +#endif TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56 MOVD s1+0(FP), R5 @@ -991,50 +1202,11 @@ TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56 MOVD s2+24(FP), R6 MOVD s2+32(FP), R4 MOVD $ret+48(FP), R7 - BR runtime·cmpbody<>(SB) - -// On entry: -// R3 is the length of s1 -// R4 is the length of s2 -// R5 points to the start of s1 -// R6 points to the start of s2 -// R7 points to return value (-1/0/1 will be written here) -// -// On exit: -// R5, R6, R8, R9 and R10 are clobbered -TEXT runtime·cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0 - CMP R5, R6 - BEQ samebytes // same starting pointers; compare lengths - SUB $1, R5 - SUB $1, R6 - MOVD R4, R8 - CMP R3, R4 - BGE 2(PC) - MOVD R3, R8 // R8 is min(R3, R4) - ADD R5, R8 // R5 is current byte in s1, R8 is last byte in s1 to compare -loop: - CMP R5, R8 - BEQ samebytes // all compared bytes were the same; compare lengths - MOVBZU 1(R5), R9 - MOVBZU 1(R6), R10 - CMP R9, R10 - BEQ loop - // bytes differed - MOVD $1, R4 - BGT 2(PC) - NEG R4 - MOVD R4, (R7) - RET -samebytes: - MOVD $1, R8 - CMP R3, R4 - BNE 3(PC) - MOVD R0, (R7) - RET - BGT 2(PC) - NEG R8 - MOVD R8, (R7) - RET +#ifdef GOARCH_ppc64le + BR cmpbodyLE<>(SB) +#else + BR cmpbodyBE<>(SB) +#endif TEXT runtime·fastrand(SB), NOSPLIT, $0-4 MOVD g_m(g), R4 -- 2.48.1