From: erifan01 Date: Wed, 25 Apr 2018 08:45:52 +0000 (+0000) Subject: internal/bytealg: optimize Index (substring lengths from 9 to 32) on arm64 X-Git-Tag: go1.11beta1~585 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=f8ef6ed24a65ef50cb81510a5720abf406c90642;p=gostls13.git internal/bytealg: optimize Index (substring lengths from 9 to 32) on arm64 The current code is not optimized for cases where the length of the substring to be searched is between 9 bytes and 32 bytes. This CL optimizes the situations. Benchmark: name old time/op new time/op delta pkg:strings goos:linux goarch:arm64 IndexHard1-8 1.06ms ± 0% 1.06ms ± 0% -0.44% (p=0.000 n=7+8) IndexHard2-8 1.25ms ± 1% 1.26ms ± 2% ~ (p=0.328 n=8+8) IndexHard3-8 2.85ms ± 1% 1.18ms ± 1% -58.59% (p=0.000 n=8+8) IndexHard4-8 2.90ms ± 1% 2.87ms ± 1% -0.96% (p=0.021 n=8+8) pkg:bytes goos:linux goarch:arm64 IndexByte/4M-8 726124.200000ns +- 6% 560021.400000ns +-20% -22.88% (p=0.008 n=5+5) IndexRune/4M-8 928768.600000ns +- 0% 793144.600000ns +- 6% -14.60% (p=0.008 n=5+5) Change-Id: Ieebeb784ae69b2a0642ea96e9486a1d120923568 Reviewed-on: https://go-review.googlesource.com/109895 Reviewed-by: Cherry Zhang Run-TryBot: Cherry Zhang TryBot-Result: Gobot Gobot --- diff --git a/src/internal/bytealg/index_arm64.go b/src/internal/bytealg/index_arm64.go index 0f87ae106c..251e63567e 100644 --- a/src/internal/bytealg/index_arm64.go +++ b/src/internal/bytealg/index_arm64.go @@ -9,8 +9,8 @@ package bytealg const MaxBruteForce = 16 func init() { - // 8 bytes can be completely loaded into 1 register. - MaxLen = 8 + // Optimize cases where the length of the substring is less than 32 bytes + MaxLen = 32 } // Cutover reports the number of failures of IndexByte we should tolerate diff --git a/src/internal/bytealg/index_arm64.s b/src/internal/bytealg/index_arm64.s index 02eb658fd0..20d68ba9b8 100644 --- a/src/internal/bytealg/index_arm64.s +++ b/src/internal/bytealg/index_arm64.s @@ -25,7 +25,7 @@ TEXT ·IndexString(SB),NOSPLIT,$0-40 // R0: haystack // R1: length of haystack // R2: needle -// R3: length of needle (2 <= len <= 8) +// R3: length of needle (2 <= len <= 32) // R9: address to put result TEXT indexbody<>(SB),NOSPLIT,$0-56 // main idea is to load 'sep' into separate register(s) @@ -35,9 +35,12 @@ TEXT indexbody<>(SB),NOSPLIT,$0-56 // R4 contains the start of last substring for comparsion ADD R0, R4, R4 ADD $1, R0, R8 + + CMP $8, R3 + BHI greater_8 TBZ $3, R3, len_2_7 len_8: - // R5 contains 8-byte sep + // R5 contains 8-byte of sep MOVD (R2), R5 loop_8: // R6 contains substring for comparison @@ -52,7 +55,7 @@ len_2_7: TBZ $1, R3, len_4_5 TBZ $0, R3, len_6 len_7: - // R5 and R6 contain 7-byte sep + // R5 and R6 contain 7-byte of sep MOVWU (R2), R5 // 1-byte overlap with R5 MOVWU 3(R2), R6 @@ -67,7 +70,7 @@ loop_7: BNE loop_7 B found len_6: - // R5 and R6 contain 6-byte sep + // R5 and R6 contain 6-byte of sep MOVWU (R2), R5 MOVHU 4(R2), R6 loop_6: @@ -83,7 +86,7 @@ loop_6: len_4_5: TBZ $0, R3, len_4 len_5: - // R5 and R7 contain 5-byte sep + // R5 and R7 contain 5-byte of sep MOVWU (R2), R5 MOVBU 4(R2), R7 loop_5: @@ -97,7 +100,7 @@ loop_5: BNE loop_5 B found len_4: - // R5 contains 4-byte sep + // R5 contains 4-byte of sep MOVWU (R2), R5 loop_4: CMP R4, R0 @@ -109,7 +112,7 @@ loop_4: len_2_3: TBZ $0, R3, len_2 len_3: - // R6 and R7 contain 3-byte sep + // R6 and R7 contain 3-byte of sep MOVHU (R2), R6 MOVBU 2(R2), R7 loop_3: @@ -123,7 +126,7 @@ loop_3: BNE loop_3 B found len_2: - // R5 contains 2-byte sep + // R5 contains 2-byte of sep MOVHU (R2), R5 loop_2: CMP R4, R0 @@ -139,3 +142,65 @@ not_found: MOVD $-1, R0 MOVD R0, (R9) RET +greater_8: + SUB $9, R3, R11 // len(sep) - 9, offset of R0 for last 8 bytes + CMP $16, R3 + BHI greater_16 +len_9_16: + MOVD.P 8(R2), R5 // R5 contains the first 8-byte of sep + SUB $16, R3, R7 // len(sep) - 16, offset of R2 for last 8 bytes + MOVD (R2)(R7), R6 // R6 contains the last 8-byte of sep +loop_9_16: + // search the first 8 bytes first + CMP R4, R0 + BHI not_found + MOVD.P 1(R0), R7 + CMP R5, R7 + BNE loop_9_16 + MOVD (R0)(R11), R7 + CMP R6, R7 // compare the last 8 bytes + BNE loop_9_16 + B found +greater_16: + CMP $24, R3 + BHI len_25_32 +len_17_24: + LDP.P 16(R2), (R5, R6) // R5 and R6 contain the first 16-byte of sep + SUB $24, R3, R10 // len(sep) - 24 + MOVD (R2)(R10), R7 // R7 contains the last 8-byte of sep +loop_17_24: + // search the first 16 bytes first + CMP R4, R0 + BHI not_found + MOVD.P 1(R0), R10 + CMP R5, R10 + BNE loop_17_24 + MOVD 7(R0), R10 + CMP R6, R10 + BNE loop_17_24 + MOVD (R0)(R11), R10 + CMP R7, R10 // compare the last 8 bytes + BNE loop_17_24 + B found +len_25_32: + LDP.P 16(R2), (R5, R6) + MOVD.P 8(R2), R7 // R5, R6 and R7 contain the first 24-byte of sep + SUB $32, R3, R12 // len(sep) - 32 + MOVD (R2)(R12), R10 // R10 contains the last 8-byte of sep +loop_25_32: + // search the first 24 bytes first + CMP R4, R0 + BHI not_found + MOVD.P 1(R0), R12 + CMP R5, R12 + BNE loop_25_32 + MOVD 7(R0), R12 + CMP R6, R12 + BNE loop_25_32 + MOVD 15(R0), R12 + CMP R7, R12 + BNE loop_25_32 + MOVD (R0)(R11), R12 + CMP R10, R12 // compare the last 8 bytes + BNE loop_25_32 + B found