From: Vasily Leonenko Date: Mon, 31 Mar 2025 20:25:00 +0000 (+0300) Subject: internal/bytealg: optimize Count/CountString on arm64 X-Git-Tag: go1.25rc1~554 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=f2e9076764b35cc34d9a166aa3b074c203aea0a2;p=gostls13.git internal/bytealg: optimize Count/CountString on arm64 Introduce ABIInternal support for Count/CountString Move <32 size block from function end to beginning as fastpath goos: linux goarch: arm64 pkg: strings │ base.txt │ new.txt │ │ B/s │ B/s vs base │ CountByte/10 672.5Mi ± 0% 692.9Mi ± 0% +3.04% (p=0.000 n=10) CountByte/32 3.592Gi ± 0% 3.970Gi ± 0% +10.53% (p=0.000 n=10) CountByte/4096 16.63Gi ± 0% 16.73Gi ± 0% +0.64% (p=0.000 n=10) CountByte/4194304 14.97Gi ± 2% 15.02Gi ± 1% ~ (p=0.190 n=10) CountByte/67108864 12.50Gi ± 0% 12.50Gi ± 0% ~ (p=0.853 n=10) geomean 5.931Gi 6.099Gi +2.83% Change-Id: I5af1be2b117d9fb8d570739637499923de62251c Reviewed-on: https://go-review.googlesource.com/c/go/+/662395 LUCI-TryBot-Result: Go LUCI Auto-Submit: Keith Randall Reviewed-by: Keith Randall Commit-Queue: Keith Randall Reviewed-by: Cherry Mui Reviewed-by: Keith Randall --- diff --git a/src/internal/bytealg/count_arm64.s b/src/internal/bytealg/count_arm64.s index e616627b1a..1e39cd5f3d 100644 --- a/src/internal/bytealg/count_arm64.s +++ b/src/internal/bytealg/count_arm64.s @@ -5,33 +5,45 @@ #include "go_asm.h" #include "textflag.h" -TEXT ·Count(SB),NOSPLIT,$0-40 - MOVD b_base+0(FP), R0 - MOVD b_len+8(FP), R2 - MOVBU c+24(FP), R1 - MOVD $ret+32(FP), R8 - B countbytebody<>(SB) - -TEXT ·CountString(SB),NOSPLIT,$0-32 - MOVD s_base+0(FP), R0 - MOVD s_len+8(FP), R2 - MOVBU c+16(FP), R1 - MOVD $ret+24(FP), R8 - B countbytebody<>(SB) +// func Count(b []byte, c byte) int +// input: +// R0: b ptr +// R1: b len +// R2: b cap +// R3: c byte to search +// return: +// R0: result +TEXT ·Count(SB),NOSPLIT,$0-40 + MOVD R3, R2 + B ·CountString(SB) +// func CountString(s string, c byte) int // input: -// R0: data -// R2: data len -// R1: byte to find -// R8: address to put result -TEXT countbytebody<>(SB),NOSPLIT,$0 +// R0: s ptr +// R1: s len +// R2: c byte to search (due to ABIInternal upper bits can contain junk) +// return: +// R0: result +TEXT ·CountString(SB),NOSPLIT,$0-32 // R11 = count of byte to search MOVD $0, R11 // short path to handle 0-byte case - CBZ R2, done - CMP $0x20, R2 - // jump directly to tail if length < 32 - BLO tail + CBZ R1, done + CMP $0x20, R1 + // jump directly to head if length >= 32 + BHS head +tail: + // Work with tail shorter than 32 bytes + MOVBU.P 1(R0), R5 + SUB $1, R1, R1 + CMP R2.UXTB, R5 + CINC EQ, R11, R11 + CBNZ R1, tail +done: + MOVD R11, R0 + RET + PCALIGN $16 +head: ANDS $0x1f, R0, R9 BEQ chunk // Work with not 32-byte aligned head @@ -40,24 +52,23 @@ TEXT countbytebody<>(SB),NOSPLIT,$0 PCALIGN $16 head_loop: MOVBU.P 1(R0), R5 - CMP R5, R1 + CMP R2.UXTB, R5 CINC EQ, R11, R11 - SUB $1, R2, R2 + SUB $1, R1, R1 CMP R0, R3 BNE head_loop - // Work with 32-byte aligned chunks chunk: - BIC $0x1f, R2, R9 + BIC $0x1f, R1, R9 // The first chunk can also be the last CBZ R9, tail // R3 = end of 32-byte chunks ADD R0, R9, R3 MOVD $1, R5 VMOV R5, V5.B16 - // R2 = length of tail - SUB R9, R2, R2 - // Duplicate R1 (byte to search) to 16 1-byte elements of V0 - VMOV R1, V0.B16 + // R1 = length of tail + SUB R9, R1, R1 + // Duplicate R2 (byte to search) to 16 1-byte elements of V0 + VMOV R2, V0.B16 // Clear the low 64-bit element of V7 and V8 VEOR V7.B8, V7.B8, V7.B8 VEOR V8.B8, V8.B8, V8.B8 @@ -79,14 +90,5 @@ chunk_loop: BNE chunk_loop VMOV V8.D[0], R6 ADD R6, R11, R11 - CBZ R2, done -tail: - // Work with tail shorter than 32 bytes - MOVBU.P 1(R0), R5 - SUB $1, R2, R2 - CMP R5, R1 - CINC EQ, R11, R11 - CBNZ R2, tail -done: - MOVD R11, (R8) - RET + CBZ R1, done + B tail