From d4e936cfd622de322dc93f69144c68bb4c133e21 Mon Sep 17 00:00:00 2001 From: erifan01 Date: Tue, 17 Apr 2018 07:37:38 +0000 Subject: [PATCH] internal/bytealg: optimize IndexString on arm64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This CL adjusts the order of the branch instructions of the code to make it easier for the LIKELY branch to happen. Benchmarks: name old time/op new time/op delta pkg:strings goos:linux goarch:arm64 IndexHard2-8 2.17ms ± 1% 1.23ms ± 0% -43.34% (p=0.008 n=5+5) CountHard2-8 2.13ms ± 1% 1.21ms ± 2% -43.31% (p=0.008 n=5+5) pkg:bytes goos:linux goarch:arm64 IndexRune/4M-8 661µs ±22% 513µs ± 0% -22.32% (p=0.008 n=5+5) IndexEasy/4M-8 672µs ±23% 513µs ± 0% -23.71% (p=0.016 n=5+4) Change-Id: Ib96f095edf77747edc8a971e79f5c1428e5808ce Reviewed-on: https://go-review.googlesource.com/109015 Reviewed-by: Cherry Zhang Run-TryBot: Cherry Zhang TryBot-Result: Gobot Gobot --- src/internal/bytealg/index_arm64.s | 74 ++++++++++++++---------------- 1 file changed, 35 insertions(+), 39 deletions(-) diff --git a/src/internal/bytealg/index_arm64.s b/src/internal/bytealg/index_arm64.s index 6c93ef3ce8..02eb658fd0 100644 --- a/src/internal/bytealg/index_arm64.s +++ b/src/internal/bytealg/index_arm64.s @@ -41,12 +41,12 @@ len_8: MOVD (R2), R5 loop_8: // R6 contains substring for comparison + CMP R4, R0 + BHI not_found MOVD.P 1(R0), R6 CMP R5, R6 - BEQ found - CMP R4, R0 - BLS loop_8 - JMP not_found + BNE loop_8 + B found len_2_7: TBZ $2, R3, len_2_3 TBZ $1, R3, len_4_5 @@ -57,31 +57,29 @@ len_7: // 1-byte overlap with R5 MOVWU 3(R2), R6 loop_7: + CMP R4, R0 + BHI not_found MOVWU.P 1(R0), R3 CMP R5, R3 - BNE not_equal_7 + BNE loop_7 MOVWU 2(R0), R3 CMP R6, R3 - BEQ found -not_equal_7: - CMP R4, R0 - BLS loop_7 - JMP not_found + BNE loop_7 + B found len_6: // R5 and R6 contain 6-byte sep MOVWU (R2), R5 MOVHU 4(R2), R6 loop_6: + CMP R4, R0 + BHI not_found MOVWU.P 1(R0), R3 CMP R5, R3 - BNE not_equal_6 + BNE loop_6 MOVHU 3(R0), R3 CMP R6, R3 - BEQ found -not_equal_6: - CMP R4, R0 - BLS loop_6 - JMP not_found + BNE loop_6 + B found len_4_5: TBZ $0, R3, len_4 len_5: @@ -89,26 +87,25 @@ len_5: MOVWU (R2), R5 MOVBU 4(R2), R7 loop_5: + CMP R4, R0 + BHI not_found MOVWU.P 1(R0), R3 CMP R5, R3 - BNE not_equal_5 + BNE loop_5 MOVBU 3(R0), R3 CMP R7, R3 - BEQ found -not_equal_5: - CMP R4, R0 - BLS loop_5 - JMP not_found + BNE loop_5 + B found len_4: // R5 contains 4-byte sep MOVWU (R2), R5 loop_4: + CMP R4, R0 + BHI not_found MOVWU.P 1(R0), R6 CMP R5, R6 - BEQ found - CMP R4, R0 - BLS loop_4 - JMP not_found + BNE loop_4 + B found len_2_3: TBZ $0, R3, len_2 len_3: @@ -116,30 +113,29 @@ len_3: MOVHU (R2), R6 MOVBU 2(R2), R7 loop_3: + CMP R4, R0 + BHI not_found MOVHU.P 1(R0), R3 CMP R6, R3 - BNE not_equal_3 + BNE loop_3 MOVBU 1(R0), R3 CMP R7, R3 - BEQ found -not_equal_3: - CMP R4, R0 - BLS loop_3 - JMP not_found + BNE loop_3 + B found len_2: // R5 contains 2-byte sep MOVHU (R2), R5 loop_2: + CMP R4, R0 + BHI not_found MOVHU.P 1(R0), R6 CMP R5, R6 - BEQ found - CMP R4, R0 - BLS loop_2 -not_found: - MOVD $-1, R0 - MOVD R0, (R9) - RET + BNE loop_2 found: SUB R8, R0, R0 MOVD R0, (R9) RET +not_found: + MOVD $-1, R0 + MOVD R0, (R9) + RET -- 2.50.0