From: fanzha02 Date: Tue, 20 Mar 2018 03:53:21 +0000 (+0000) Subject: internal/bytealg: add optimized Compare for arm64 X-Git-Tag: go1.11beta1~880 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=d31ee64186aa174778b6353ac244b078bb275eb8;p=gostls13.git internal/bytealg: add optimized Compare for arm64 Use LDP instructions to load 16 bytes per loop when the source length is long. Specially process the 8 bytes length, 4 bytes length and 2 bytes length to get a better performance. Benchmark result: name old time/op new time/op delta BytesCompare/1-8 21.0ns ± 0% 10.5ns ± 0% ~ (p=0.079 n=4+5) BytesCompare/2-8 11.5ns ± 0% 10.5ns ± 0% -8.70% (p=0.008 n=5+5) BytesCompare/4-8 13.5ns ± 0% 10.0ns ± 0% -25.93% (p=0.008 n=5+5) BytesCompare/8-8 28.8ns ± 0% 9.5ns ± 0% ~ (p=0.079 n=4+5) BytesCompare/16-8 40.5ns ± 0% 10.5ns ± 0% -74.07% (p=0.008 n=5+5) BytesCompare/32-8 64.6ns ± 0% 12.5ns ± 0% -80.65% (p=0.008 n=5+5) BytesCompare/64-8 112ns ± 0% 16ns ± 0% -85.27% (p=0.008 n=5+5) BytesCompare/128-8 208ns ± 0% 24ns ± 0% -88.22% (p=0.008 n=5+5) BytesCompare/256-8 400ns ± 0% 50ns ± 0% -87.62% (p=0.008 n=5+5) BytesCompare/512-8 785ns ± 0% 82ns ± 0% -89.61% (p=0.008 n=5+5) BytesCompare/1024-8 1.55µs ± 0% 0.14µs ± 0% ~ (p=0.079 n=4+5) BytesCompare/2048-8 3.09µs ± 0% 0.27µs ± 0% ~ (p=0.079 n=4+5) CompareBytesEqual-8 39.0ns ± 0% 12.0ns ± 0% -69.23% (p=0.008 n=5+5) CompareBytesToNil-8 8.57ns ± 5% 8.23ns ± 2% -3.99% (p=0.016 n=5+5) CompareBytesEmpty-8 7.37ns ± 0% 7.36ns ± 4% ~ (p=0.690 n=5+5) CompareBytesIdentical-8 7.39ns ± 0% 7.46ns ± 2% ~ (p=0.667 n=5+5) CompareBytesSameLength-8 17.0ns ± 0% 10.5ns ± 0% -38.24% (p=0.008 n=5+5) CompareBytesDifferentLength-8 17.0ns ± 0% 10.5ns ± 0% -38.24% (p=0.008 n=5+5) CompareBytesBigUnaligned-8 1.58ms ± 0% 0.19ms ± 0% -88.31% (p=0.016 n=4+5) CompareBytesBig-8 1.59ms ± 0% 0.19ms ± 0% -88.27% (p=0.016 n=5+4) CompareBytesBigIdentical-8 7.01ns ± 0% 6.60ns ± 3% -5.91% (p=0.008 n=5+5) name old speed new speed delta CompareBytesBigUnaligned-8 662MB/s ± 0% 5660MB/s ± 0% +755.15% (p=0.016 n=4+5) CompareBytesBig-8 661MB/s ± 0% 5636MB/s ± 0% +752.57% (p=0.016 n=5+4) CompareBytesBigIdentical-8 150TB/s ± 0% 159TB/s ± 3% +6.27% (p=0.008 n=5+5) This is resubmit of CL90175. Change-Id: Ie841daedb3123a68dd2554f27ebef0b3f8a855c2 Reviewed-on: https://go-review.googlesource.com/101635 Run-TryBot: Cherry Zhang Reviewed-by: Cherry Zhang --- diff --git a/src/internal/bytealg/compare_arm64.s b/src/internal/bytealg/compare_arm64.s index 9b6354715a..2bd38064c3 100644 --- a/src/internal/bytealg/compare_arm64.s +++ b/src/internal/bytealg/compare_arm64.s @@ -10,7 +10,7 @@ TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56 MOVD a_len+8(FP), R0 MOVD b_base+24(FP), R3 MOVD b_len+32(FP), R1 - ADD $56, RSP, R7 + MOVD $ret+48(FP), R7 B cmpbody<>(SB) TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56 @@ -18,7 +18,7 @@ TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56 MOVD a_len+8(FP), R0 MOVD b_base+24(FP), R3 MOVD b_len+32(FP), R1 - ADD $56, RSP, R7 + MOVD $ret+48(FP), R7 B cmpbody<>(SB) TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 @@ -26,7 +26,7 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 MOVD a_len+8(FP), R0 MOVD b_base+16(FP), R3 MOVD b_len+24(FP), R1 - ADD $40, RSP, R7 + MOVD $ret+32(FP), R7 B cmpbody<>(SB) // On entry: @@ -37,30 +37,98 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 // R7 points to return value (-1/0/1 will be written here) // // On exit: -// R4, R5, and R6 are clobbered +// R4, R5, R6, R8, R9 and R10 are clobbered TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0 CMP R2, R3 - BEQ samebytes // same starting pointers; compare lengths + BEQ samebytes // same starting pointers; compare lengths CMP R0, R1 - CSEL LT, R1, R0, R6 // R6 is min(R0, R1) + CSEL LT, R1, R0, R6 // R6 is min(R0, R1) - ADD R2, R6 // R2 is current byte in a, R6 is last byte in a to compare -loop: - CMP R2, R6 - BEQ samebytes // all compared bytes were the same; compare lengths - MOVBU.P 1(R2), R4 - MOVBU.P 1(R3), R5 + CMP $0, R6 + BEQ samebytes + BIC $0xf, R6, R10 + CBZ R10, small // length < 16 + ADD R2, R10 // end of chunk16 + // length >= 16 +chunk16_loop: + LDP.P 16(R2), (R4, R8) + LDP.P 16(R3), (R5, R9) CMP R4, R5 - BEQ loop - // bytes differed + BNE cmp + CMP R8, R9 + BNE cmpnext + CMP R10, R2 + BNE chunk16_loop + AND $0xf, R6, R6 + CBZ R6, samebytes + SUBS $8, R6 + BLT tail + // the length of tail > 8 bytes + MOVD.P 8(R2), R4 + MOVD.P 8(R3), R5 + CMP R4, R5 + BNE cmp + SUB $8, R6 + // compare last 8 bytes +tail: + MOVD (R2)(R6), R4 + MOVD (R3)(R6), R5 + CMP R4, R5 + BEQ samebytes +cmp: + REV R4, R4 + REV R5, R5 + CMP R4, R5 +ret: MOVD $1, R4 - CSNEG LT, R4, R4, R4 + CNEG HI, R4, R4 MOVD R4, (R7) RET +small: + TBZ $3, R6, lt_8 + MOVD (R2), R4 + MOVD (R3), R5 + CMP R4, R5 + BNE cmp + SUBS $8, R6 + BEQ samebytes + ADD $8, R2 + ADD $8, R3 + SUB $8, R6 + B tail +lt_8: + TBZ $2, R6, lt_4 + MOVWU (R2), R4 + MOVWU (R3), R5 + CMPW R4, R5 + BNE cmp + SUBS $4, R6 + BEQ samebytes + ADD $4, R2 + ADD $4, R3 +lt_4: + TBZ $1, R6, lt_2 + MOVHU (R2), R4 + MOVHU (R3), R5 + CMPW R4, R5 + BNE cmp + ADD $2, R2 + ADD $2, R3 +lt_2: + TBZ $0, R6, samebytes +one: + MOVBU (R2), R4 + MOVBU (R3), R5 + CMPW R4, R5 + BNE ret samebytes: - MOVD $1, R4 - CMP R0, R1 - CSNEG LT, R4, R4, R4 - CSEL EQ, ZR, R4, R4 + CMP R1, R0 + CSET NE, R4 + CNEG LO, R4, R4 MOVD R4, (R7) RET +cmpnext: + REV R8, R4 + REV R9, R5 + CMP R4, R5 + B ret