From: Wei Xiao Date: Wed, 21 Jun 2017 03:30:14 +0000 (+0000) Subject: bytes: add optimized Equal for arm64 X-Git-Tag: go1.10beta1~600 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=78ddf2741f51fdb0f6db4190ef2053a36be91751;p=gostls13.git bytes: add optimized Equal for arm64 Use SIMD instructions when comparing chunks bigger than 16 bytes. Benchmark results of bytes: name old time/op new time/op delta Equal/0-8 6.52ns ± 1% 5.51ns ± 0% -15.43% (p=0.000 n=8+9) Equal/1-8 11.5ns ± 0% 10.5ns ± 0% -8.70% (p=0.000 n=10+10) Equal/6-8 19.0ns ± 0% 13.5ns ± 0% -28.95% (p=0.000 n=10+10) Equal/9-8 31.0ns ± 0% 13.5ns ± 0% -56.45% (p=0.000 n=10+10) Equal/15-8 40.0ns ± 0% 15.5ns ± 0% -61.25% (p=0.000 n=10+10) Equal/16-8 41.5ns ± 0% 14.5ns ± 0% -65.06% (p=0.000 n=10+10) Equal/20-8 47.5ns ± 0% 17.0ns ± 0% -64.21% (p=0.000 n=10+10) Equal/32-8 65.6ns ± 0% 17.0ns ± 0% -74.09% (p=0.000 n=10+10) Equal/4K-8 6.17µs ± 0% 0.57µs ± 1% -90.76% (p=0.000 n=10+10) Equal/4M-8 6.41ms ± 0% 1.11ms ±14% -82.71% (p=0.000 n=8+10) Equal/64M-8 104ms ± 0% 33ms ± 0% -68.64% (p=0.000 n=10+10) EqualPort/1-8 13.0ns ± 0% 13.0ns ± 0% ~ (all equal) EqualPort/6-8 22.0ns ± 0% 22.7ns ± 0% +3.06% (p=0.000 n=8+9) EqualPort/32-8 78.1ns ± 0% 78.1ns ± 0% ~ (all equal) EqualPort/4K-8 7.54µs ± 0% 7.61µs ± 0% +0.92% (p=0.000 n=10+8) EqualPort/4M-8 8.16ms ± 2% 8.05ms ± 1% -1.31% (p=0.023 n=10+10) EqualPort/64M-8 142ms ± 0% 142ms ± 0% +0.37% (p=0.000 n=10+10) CompareBytesEqual-8 39.0ns ± 0% 41.6ns ± 2% +6.67% (p=0.000 n=9+10) name old speed new speed delta Equal/1-8 86.9MB/s ± 0% 95.2MB/s ± 0% +9.53% (p=0.000 n=8+8) Equal/6-8 315MB/s ± 0% 444MB/s ± 0% +40.74% (p=0.000 n=9+10) Equal/9-8 290MB/s ± 0% 666MB/s ± 0% +129.63% (p=0.000 n=8+10) Equal/15-8 375MB/s ± 0% 967MB/s ± 0% +158.09% (p=0.000 n=10+10) Equal/16-8 385MB/s ± 0% 1103MB/s ± 0% +186.24% (p=0.000 n=10+9) Equal/20-8 421MB/s ± 0% 1175MB/s ± 0% +179.44% (p=0.000 n=9+10) Equal/32-8 488MB/s ± 0% 1881MB/s ± 0% +285.34% (p=0.000 n=10+8) Equal/4K-8 664MB/s ± 0% 7181MB/s ± 1% +981.32% (p=0.000 n=10+10) Equal/4M-8 654MB/s ± 0% 3822MB/s ±16% +484.15% (p=0.000 n=8+10) Equal/64M-8 645MB/s ± 0% 2056MB/s ± 0% +218.90% (p=0.000 n=10+10) EqualPort/1-8 76.8MB/s ± 0% 76.7MB/s ± 0% -0.09% (p=0.023 n=10+10) EqualPort/6-8 272MB/s ± 0% 264MB/s ± 0% -2.94% (p=0.000 n=8+10) EqualPort/32-8 410MB/s ± 0% 410MB/s ± 0% +0.01% (p=0.004 n=9+10) EqualPort/4K-8 543MB/s ± 0% 538MB/s ± 0% -0.91% (p=0.000 n=9+9) EqualPort/4M-8 514MB/s ± 2% 521MB/s ± 1% +1.31% (p=0.023 n=10+10) EqualPort/64M-8 473MB/s ± 0% 472MB/s ± 0% -0.37% (p=0.000 n=10+10) Benchmark results of go1: name old time/op new time/op delta BinaryTree17-8 6.53s ± 0% 6.52s ± 2% ~ (p=0.286 n=4+5) Fannkuch11-8 6.35s ± 1% 6.33s ± 0% ~ (p=0.690 n=5+5) FmtFprintfEmpty-8 108ns ± 1% 99ns ± 1% -8.31% (p=0.008 n=5+5) FmtFprintfString-8 172ns ± 1% 188ns ± 0% +9.43% (p=0.016 n=5+4) FmtFprintfInt-8 207ns ± 0% 202ns ± 0% -2.42% (p=0.008 n=5+5) FmtFprintfIntInt-8 277ns ± 1% 271ns ± 1% -2.02% (p=0.008 n=5+5) FmtFprintfPrefixedInt-8 386ns ± 0% 380ns ± 0% -1.55% (p=0.008 n=5+5) FmtFprintfFloat-8 492ns ± 0% 494ns ± 1% ~ (p=0.175 n=4+5) FmtManyArgs-8 1.32µs ± 1% 1.31µs ± 2% ~ (p=0.651 n=5+5) GobDecode-8 16.8ms ± 2% 16.9ms ± 1% ~ (p=0.310 n=5+5) GobEncode-8 14.1ms ± 1% 14.1ms ± 1% ~ (p=1.000 n=5+5) Gzip-8 788ms ± 0% 789ms ± 0% ~ (p=0.548 n=5+5) Gunzip-8 83.6ms ± 0% 83.6ms ± 0% ~ (p=0.548 n=5+5) HTTPClientServer-8 120µs ± 0% 120µs ± 1% ~ (p=0.690 n=5+5) JSONEncode-8 33.2ms ± 0% 33.6ms ± 0% +1.20% (p=0.008 n=5+5) JSONDecode-8 152ms ± 1% 146ms ± 1% -3.70% (p=0.008 n=5+5) Mandelbrot200-8 10.0ms ± 0% 10.0ms ± 0% ~ (p=0.151 n=5+5) GoParse-8 7.97ms ± 0% 8.06ms ± 0% +1.15% (p=0.008 n=5+5) RegexpMatchEasy0_32-8 233ns ± 1% 239ns ± 4% ~ (p=0.135 n=5+5) RegexpMatchEasy0_1K-8 1.86µs ± 0% 1.86µs ± 0% ~ (p=0.167 n=5+5) RegexpMatchEasy1_32-8 250ns ± 0% 263ns ± 1% +5.28% (p=0.008 n=5+5) RegexpMatchEasy1_1K-8 2.28µs ± 0% 2.13µs ± 0% -6.64% (p=0.000 n=4+5) RegexpMatchMedium_32-8 332ns ± 1% 319ns ± 0% -3.97% (p=0.008 n=5+5) RegexpMatchMedium_1K-8 85.5µs ± 2% 79.1µs ± 1% -7.42% (p=0.008 n=5+5) RegexpMatchHard_32-8 4.34µs ± 1% 4.42µs ± 7% ~ (p=0.881 n=5+5) RegexpMatchHard_1K-8 130µs ± 1% 127µs ± 0% -2.18% (p=0.008 n=5+5) Revcomp-8 1.35s ± 1% 1.34s ± 0% -0.58% (p=0.016 n=5+4) Template-8 160ms ± 2% 158ms ± 1% ~ (p=0.222 n=5+5) TimeParse-8 795ns ± 2% 772ns ± 2% -2.87% (p=0.024 n=5+5) TimeFormat-8 782ns ± 0% 784ns ± 0% ~ (p=0.198 n=5+5) name old speed new speed delta GobDecode-8 45.8MB/s ± 2% 45.5MB/s ± 1% ~ (p=0.310 n=5+5) GobEncode-8 54.3MB/s ± 1% 54.4MB/s ± 1% ~ (p=0.984 n=5+5) Gzip-8 24.6MB/s ± 0% 24.6MB/s ± 0% ~ (p=0.540 n=5+5) Gunzip-8 232MB/s ± 0% 232MB/s ± 0% ~ (p=0.548 n=5+5) JSONEncode-8 58.4MB/s ± 0% 57.7MB/s ± 0% -1.19% (p=0.008 n=5+5) JSONDecode-8 12.8MB/s ± 1% 13.3MB/s ± 1% +3.85% (p=0.008 n=5+5) GoParse-8 7.27MB/s ± 0% 7.18MB/s ± 0% -1.13% (p=0.008 n=5+5) RegexpMatchEasy0_32-8 137MB/s ± 1% 134MB/s ± 4% ~ (p=0.151 n=5+5) RegexpMatchEasy0_1K-8 551MB/s ± 0% 550MB/s ± 0% ~ (p=0.222 n=5+5) RegexpMatchEasy1_32-8 128MB/s ± 0% 121MB/s ± 1% -5.09% (p=0.008 n=5+5) RegexpMatchEasy1_1K-8 449MB/s ± 0% 481MB/s ± 0% +7.12% (p=0.016 n=4+5) RegexpMatchMedium_32-8 3.00MB/s ± 0% 3.13MB/s ± 0% +4.33% (p=0.016 n=4+5) RegexpMatchMedium_1K-8 12.0MB/s ± 2% 12.9MB/s ± 1% +7.98% (p=0.008 n=5+5) RegexpMatchHard_32-8 7.38MB/s ± 1% 7.25MB/s ± 7% ~ (p=0.952 n=5+5) RegexpMatchHard_1K-8 7.88MB/s ± 1% 8.05MB/s ± 0% +2.21% (p=0.008 n=5+5) Revcomp-8 188MB/s ± 1% 189MB/s ± 0% +0.58% (p=0.016 n=5+4) Template-8 12.2MB/s ± 2% 12.3MB/s ± 1% ~ (p=0.183 n=5+5) Change-Id: I65e79f3f8f8b2914678311c4f1b0a2d98459e220 Reviewed-on: https://go-review.googlesource.com/71110 Reviewed-by: Cherry Zhang Run-TryBot: Cherry Zhang --- diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s index 8f2e03c7ef..e4b2c37038 100644 --- a/src/runtime/asm_arm64.s +++ b/src/runtime/asm_arm64.s @@ -723,26 +723,18 @@ TEXT runtime·abort(SB),NOSPLIT,$-8-0 B (ZR) UNDEF -// memequal(p, q unsafe.Pointer, size uintptr) bool +// memequal(a, b unsafe.Pointer, size uintptr) bool TEXT runtime·memequal(SB),NOSPLIT,$-8-25 - MOVD a+0(FP), R1 + MOVD size+16(FP), R1 + // short path to handle 0-byte case + CBZ R1, equal + MOVD a+0(FP), R0 MOVD b+8(FP), R2 - MOVD size+16(FP), R3 - ADD R1, R3, R6 + MOVD $ret+24(FP), R8 + B runtime·memeqbody<>(SB) +equal: MOVD $1, R0 MOVB R0, ret+24(FP) - CMP R1, R2 - BEQ done -loop: - CMP R1, R6 - BEQ done - MOVBU.P 1(R1), R4 - MOVBU.P 1(R2), R5 - CMP R4, R5 - BEQ loop - - MOVB $0, ret+24(FP) -done: RET // memequal_varlen(a, b unsafe.Pointer) bool @@ -865,28 +857,110 @@ notfound: MOVD R0, ret+24(FP) RET -// TODO: share code with memequal? +// Equal(a, b []byte) bool TEXT bytes·Equal(SB),NOSPLIT,$0-49 MOVD a_len+8(FP), R1 MOVD b_len+32(FP), R3 - CMP R1, R3 // unequal lengths are not equal - BNE notequal + CMP R1, R3 + // unequal lengths are not equal + BNE not_equal + // short path to handle 0-byte case + CBZ R1, equal MOVD a+0(FP), R0 MOVD b+24(FP), R2 - ADD R0, R1 // end -loop: - CMP R0, R1 - BEQ equal // reaches the end - MOVBU.P 1(R0), R4 - MOVBU.P 1(R2), R5 - CMP R4, R5 - BEQ loop -notequal: + MOVD $ret+48(FP), R8 + B runtime·memeqbody<>(SB) +equal: + MOVD $1, R0 + MOVB R0, ret+48(FP) + RET +not_equal: MOVB ZR, ret+48(FP) RET + +// input: +// R0: pointer a +// R1: data len +// R2: pointer b +// R8: address to put result +TEXT runtime·memeqbody<>(SB),NOSPLIT,$0 + CMP $1, R1 + // handle 1-byte special case for better performance + BEQ one + CMP $16, R1 + // handle specially if length < 16 + BLO tail + BIC $0x3f, R1, R3 + CBZ R3, chunk16 + // work with 64-byte chunks + ADD R3, R0, R6 // end of chunks +chunk64_loop: + VLD1.P (R0), [V0.D2, V1.D2, V2.D2, V3.D2] + VLD1.P (R2), [V4.D2, V5.D2, V6.D2, V7.D2] + VCMEQ V0.D2, V4.D2, V8.D2 + VCMEQ V1.D2, V5.D2, V9.D2 + VCMEQ V2.D2, V6.D2, V10.D2 + VCMEQ V3.D2, V7.D2, V11.D2 + VAND V8.B16, V9.B16, V8.B16 + VAND V8.B16, V10.B16, V8.B16 + VAND V8.B16, V11.B16, V8.B16 + CMP R0, R6 + VMOV V8.D[0], R4 + VMOV V8.D[1], R5 + CBZ R4, not_equal + CBZ R5, not_equal + BNE chunk64_loop + AND $0x3f, R1, R1 + CBZ R1, equal +chunk16: + // work with 16-byte chunks + BIC $0xf, R1, R3 + CBZ R3, tail + ADD R3, R0, R6 // end of chunks +chunk16_loop: + VLD1.P (R0), [V0.D2] + VLD1.P (R2), [V1.D2] + VCMEQ V0.D2, V1.D2, V2.D2 + CMP R0, R6 + VMOV V2.D[0], R4 + VMOV V2.D[1], R5 + CBZ R4, not_equal + CBZ R5, not_equal + BNE chunk16_loop + AND $0xf, R1, R1 + CBZ R1, equal +tail: + // special compare of tail with length < 16 + TBZ $3, R1, lt_8 + MOVD.P 8(R0), R4 + MOVD.P 8(R2), R5 + CMP R4, R5 + BNE not_equal +lt_8: + TBZ $2, R1, lt_4 + MOVWU.P 4(R0), R4 + MOVWU.P 4(R2), R5 + CMP R4, R5 + BNE not_equal +lt_4: + TBZ $1, R1, lt_2 + MOVHU.P 2(R0), R4 + MOVHU.P 2(R2), R5 + CMP R4, R5 + BNE not_equal +lt_2: + TBZ $0, R1, equal +one: + MOVBU (R0), R4 + MOVBU (R2), R5 + CMP R4, R5 + BNE not_equal equal: MOVD $1, R0 - MOVB R0, ret+48(FP) + MOVB R0, (R8) + RET +not_equal: + MOVB ZR, (R8) RET TEXT runtime·return0(SB), NOSPLIT, $0