Move the optimized Count implementation from bytes to runtime, and use it
in both the bytes and strings packages.
Add a CountByte benchmark to strings.
Strings benchmarks:
name old time/op new time/op delta
CountHard1-4 226µs ± 1% 226µs ± 2% ~ (p=0.247 n=10+10)
CountHard2-4 316µs ± 1% 315µs ± 0% ~ (p=0.133 n=9+10)
CountHard3-4 919µs ± 1% 920µs ± 1% ~ (p=0.968 n=10+9)
CountTorture-4 15.4µs ± 1% 15.7µs ± 1% +2.47% (p=0.000 n=10+9)
CountTortureOverlapping-4 9.60ms ± 0% 9.65ms ± 1% ~ (p=0.247 n=10+10)
CountByte/10-4 26.3ns ± 1% 10.9ns ± 1% -58.71% (p=0.000 n=9+9)
CountByte/32-4 42.7ns ± 0% 14.2ns ± 0% -66.64% (p=0.000 n=10+10)
CountByte/4096-4 3.07µs ± 0% 0.31µs ± 2% -89.99% (p=0.000 n=9+10)
CountByte/4194304-4 3.48ms ± 1% 0.34ms ± 1% -90.09% (p=0.000 n=10+9)
CountByte/67108864-4 55.6ms ± 1% 7.0ms ± 0% -87.49% (p=0.000 n=9+8)
name old speed new speed delta
CountByte/10-4 380MB/s ± 1% 919MB/s ± 1% +142.21% (p=0.000 n=9+9)
CountByte/32-4 750MB/s ± 0% 2247MB/s ± 0% +199.62% (p=0.000 n=10+10)
CountByte/4096-4 1.33GB/s ± 0% 13.32GB/s ± 2% +898.13% (p=0.000 n=9+10)
CountByte/4194304-4 1.21GB/s ± 1% 12.17GB/s ± 1% +908.87% (p=0.000 n=10+9)
CountByte/67108864-4 1.21GB/s ± 1% 9.65GB/s ± 0% +699.29% (p=0.000 n=9+8)
Fixes #19411
Change-Id: I8d2d409f0fa6df6d03b60790aa86e540b4a4e3b0
Reviewed-on: https://go-review.googlesource.com/38693
Reviewed-by: Keith Randall <khr@golang.org>
// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
// indexShortStr requires 2 <= len(c) <= shortStringLen
-func indexShortStr(s, c []byte) int // ../runtime/asm_$GOARCH.s
-func supportAVX2() bool // ../runtime/asm_$GOARCH.s
-func supportPOPCNT() bool // ../runtime/asm_$GOARCH.s
+func indexShortStr(s, c []byte) int // ../runtime/asm_$GOARCH.s
+func supportAVX2() bool // ../runtime/asm_$GOARCH.s
+func supportPOPCNT() bool // ../runtime/asm_$GOARCH.s
+func countByte(s []byte, c byte) int // ../runtime/asm_$GOARCH.s
var shortStringLen int
return -1
}
-// Special case for when we must count occurrences of a single byte.
-func countByte(s []byte, c byte) int
-
// Count counts the number of non-overlapping instances of sep in s.
// If sep is an empty slice, Count returns 1 + the number of Unicode code points in s.
func Count(s, sep []byte) int {
+++ /dev/null
-// Copyright 2017 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// We use:
-// SI: data
-// BX: data len
-// AL: byte sought
-// This requires the POPCNT instruction
-TEXT ·countByte(SB),NOSPLIT,$0-40
- MOVQ s+0(FP), SI
- MOVQ s_len+8(FP), BX
- MOVB c+24(FP), AL
-
- // Shuffle X0 around so that each byte contains
- // the character we're looking for.
- MOVD AX, X0
- PUNPCKLBW X0, X0
- PUNPCKLBW X0, X0
- PSHUFL $0, X0, X0
-
- CMPQ BX, $16
- JLT small
-
- MOVQ $0, R12 // Accumulator
-
- MOVQ SI, DI
-
- CMPQ BX, $32
- JA avx2
-sse:
- LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
- JMP sseloopentry
-
-sseloop:
- // Move the next 16-byte chunk of the data into X1.
- MOVOU (DI), X1
- // Compare bytes in X0 to X1.
- PCMPEQB X0, X1
- // Take the top bit of each byte in X1 and put the result in DX.
- PMOVMSKB X1, DX
- // Count number of matching bytes
- POPCNTL DX, DX
- // Accumulate into R12
- ADDQ DX, R12
- // Advance to next block.
- ADDQ $16, DI
-sseloopentry:
- CMPQ DI, AX
- JBE sseloop
-
- // Get the number of bytes to consider in the last 16 bytes
- ANDQ $15, BX
- JZ end
-
- // Create mask to ignore overlap between previous 16 byte block
- // and the next.
- MOVQ $16,CX
- SUBQ BX, CX
- MOVQ $0xFFFF, R10
- SARQ CL, R10
- SALQ CL, R10
-
- // Process the last 16-byte chunk. This chunk may overlap with the
- // chunks we've already searched so we need to mask part of it.
- MOVOU (AX), X1
- PCMPEQB X0, X1
- PMOVMSKB X1, DX
- // Apply mask
- ANDQ R10, DX
- POPCNTL DX, DX
- ADDQ DX, R12
-end:
- MOVQ R12, ret+32(FP)
- RET
-
-// handle for lengths < 16
-small:
- TESTQ BX, BX
- JEQ endzero
-
- // Check if we'll load across a page boundary.
- LEAQ 16(SI), AX
- TESTW $0xff0, AX
- JEQ endofpage
-
- // We must ignore high bytes as they aren't part of our slice.
- // Create mask.
- MOVB BX, CX
- MOVQ $1, R10
- SALQ CL, R10
- SUBQ $1, R10
-
- // Load data
- MOVOU (SI), X1
- // Compare target byte with each byte in data.
- PCMPEQB X0, X1
- // Move result bits to integer register.
- PMOVMSKB X1, DX
- // Apply mask
- ANDQ R10, DX
- POPCNTL DX, DX
- // Directly return DX, we don't need to accumulate
- // since we have <16 bytes.
- MOVQ DX, ret+32(FP)
- RET
-endzero:
- MOVQ $0, ret+32(FP)
- RET
-
-endofpage:
- // We must ignore low bytes as they aren't part of our slice.
- MOVQ $16,CX
- SUBQ BX, CX
- MOVQ $0xFFFF, R10
- SARQ CL, R10
- SALQ CL, R10
-
- // Load data into the high end of X1.
- MOVOU -16(SI)(BX*1), X1
- // Compare target byte with each byte in data.
- PCMPEQB X0, X1
- // Move result bits to integer register.
- PMOVMSKB X1, DX
- // Apply mask
- ANDQ R10, DX
- // Directly return DX, we don't need to accumulate
- // since we have <16 bytes.
- POPCNTL DX, DX
- MOVQ DX, ret+32(FP)
- RET
-
-avx2:
- CMPB runtime·support_avx2(SB), $1
- JNE sse
- MOVD AX, X0
- LEAQ -32(SI)(BX*1), R11
- VPBROADCASTB X0, Y1
-avx2_loop:
- VMOVDQU (DI), Y2
- VPCMPEQB Y1, Y2, Y3
- VPMOVMSKB Y3, DX
- POPCNTL DX, DX
- ADDQ DX, R12
- ADDQ $32, DI
- CMPQ DI, R11
- JLE avx2_loop
-
- // If last block is already processed,
- // skip to the end.
- CMPQ DI, R11
- JEQ endavx
-
- // Load address of the last 32 bytes.
- // There is an overlap with the previous block.
- MOVQ R11, DI
- VMOVDQU (DI), Y2
- VPCMPEQB Y1, Y2, Y3
- VPMOVMSKB Y3, DX
- // Exit AVX mode.
- VZEROUPPER
-
- // Create mask to ignore overlap between previous 32 byte block
- // and the next.
- ANDQ $31, BX
- MOVQ $32,CX
- SUBQ BX, CX
- MOVQ $0xFFFFFFFF, R10
- SARQ CL, R10
- SALQ CL, R10
- // Apply mask
- ANDQ R10, DX
- POPCNTL DX, DX
- ADDQ DX, R12
- MOVQ R12, ret+32(FP)
- RET
-endavx:
- // Exit AVX mode.
- VZEROUPPER
- MOVQ R12, ret+32(FP)
- RET
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package bytes
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: supportAVX2 is in package strings
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: supportAVX2 is in package bytes
+runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: supportPOPCNT is in package strings
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: supportPOPCNT is in package bytes
+runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: countByte is in package strings
+runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: countByte is in package bytes
// Intentionally missing declarations. These are special assembly routines.
// Some are jumped into from other routines, with values in specific registers.
MOVB AX, ret+0(FP)
RET
+TEXT strings·supportPOPCNT(SB),NOSPLIT,$0-1
+ MOVBLZX runtime·support_popcnt(SB), AX
+ MOVB AX, ret+0(FP)
+ RET
+
TEXT bytes·supportPOPCNT(SB),NOSPLIT,$0-1
MOVBLZX runtime·support_popcnt(SB), AX
MOVB AX, ret+0(FP)
MOVB $0, ret+48(FP)
RET
+
+TEXT bytes·countByte(SB),NOSPLIT,$0-40
+ MOVQ s+0(FP), SI
+ MOVQ s_len+8(FP), BX
+ MOVB c+24(FP), AL
+ LEAQ ret+32(FP), R8
+ JMP runtime·countByte(SB)
+
+TEXT strings·countByte(SB),NOSPLIT,$0-32
+ MOVQ s+0(FP), SI
+ MOVQ s_len+8(FP), BX
+ MOVB c+16(FP), AL
+ LEAQ ret+24(FP), R8
+ JMP runtime·countByte(SB)
+
+// input:
+// SI: data
+// BX: data len
+// AL: byte sought
+// R8: address to put result
+// This requires the POPCNT instruction
+TEXT runtime·countByte(SB),NOSPLIT,$0
+ // Shuffle X0 around so that each byte contains
+ // the character we're looking for.
+ MOVD AX, X0
+ PUNPCKLBW X0, X0
+ PUNPCKLBW X0, X0
+ PSHUFL $0, X0, X0
+
+ CMPQ BX, $16
+ JLT small
+
+ MOVQ $0, R12 // Accumulator
+
+ MOVQ SI, DI
+
+ CMPQ BX, $32
+ JA avx2
+sse:
+ LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
+ JMP sseloopentry
+
+sseloop:
+ // Move the next 16-byte chunk of the data into X1.
+ MOVOU (DI), X1
+ // Compare bytes in X0 to X1.
+ PCMPEQB X0, X1
+ // Take the top bit of each byte in X1 and put the result in DX.
+ PMOVMSKB X1, DX
+ // Count number of matching bytes
+ POPCNTL DX, DX
+ // Accumulate into R12
+ ADDQ DX, R12
+ // Advance to next block.
+ ADDQ $16, DI
+sseloopentry:
+ CMPQ DI, AX
+ JBE sseloop
+
+ // Get the number of bytes to consider in the last 16 bytes
+ ANDQ $15, BX
+ JZ end
+
+ // Create mask to ignore overlap between previous 16 byte block
+ // and the next.
+ MOVQ $16,CX
+ SUBQ BX, CX
+ MOVQ $0xFFFF, R10
+ SARQ CL, R10
+ SALQ CL, R10
+
+ // Process the last 16-byte chunk. This chunk may overlap with the
+ // chunks we've already searched so we need to mask part of it.
+ MOVOU (AX), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, DX
+ // Apply mask
+ ANDQ R10, DX
+ POPCNTL DX, DX
+ ADDQ DX, R12
+end:
+ MOVQ R12, (R8)
+ RET
+
+// handle for lengths < 16
+small:
+ TESTQ BX, BX
+ JEQ endzero
+
+ // Check if we'll load across a page boundary.
+ LEAQ 16(SI), AX
+ TESTW $0xff0, AX
+ JEQ endofpage
+
+ // We must ignore high bytes as they aren't part of our slice.
+ // Create mask.
+ MOVB BX, CX
+ MOVQ $1, R10
+ SALQ CL, R10
+ SUBQ $1, R10
+
+ // Load data
+ MOVOU (SI), X1
+ // Compare target byte with each byte in data.
+ PCMPEQB X0, X1
+ // Move result bits to integer register.
+ PMOVMSKB X1, DX
+ // Apply mask
+ ANDQ R10, DX
+ POPCNTL DX, DX
+ // Directly return DX, we don't need to accumulate
+ // since we have <16 bytes.
+ MOVQ DX, (R8)
+ RET
+endzero:
+ MOVQ $0, (R8)
+ RET
+
+endofpage:
+ // We must ignore low bytes as they aren't part of our slice.
+ MOVQ $16,CX
+ SUBQ BX, CX
+ MOVQ $0xFFFF, R10
+ SARQ CL, R10
+ SALQ CL, R10
+
+ // Load data into the high end of X1.
+ MOVOU -16(SI)(BX*1), X1
+ // Compare target byte with each byte in data.
+ PCMPEQB X0, X1
+ // Move result bits to integer register.
+ PMOVMSKB X1, DX
+ // Apply mask
+ ANDQ R10, DX
+ // Directly return DX, we don't need to accumulate
+ // since we have <16 bytes.
+ POPCNTL DX, DX
+ MOVQ DX, (R8)
+ RET
+
+avx2:
+ CMPB runtime·support_avx2(SB), $1
+ JNE sse
+ MOVD AX, X0
+ LEAQ -32(SI)(BX*1), R11
+ VPBROADCASTB X0, Y1
+avx2_loop:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, DX
+ POPCNTL DX, DX
+ ADDQ DX, R12
+ ADDQ $32, DI
+ CMPQ DI, R11
+ JLE avx2_loop
+
+ // If last block is already processed,
+ // skip to the end.
+ CMPQ DI, R11
+ JEQ endavx
+
+ // Load address of the last 32 bytes.
+ // There is an overlap with the previous block.
+ MOVQ R11, DI
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, DX
+ // Exit AVX mode.
+ VZEROUPPER
+
+ // Create mask to ignore overlap between previous 32 byte block
+ // and the next.
+ ANDQ $31, BX
+ MOVQ $32,CX
+ SUBQ BX, CX
+ MOVQ $0xFFFFFFFF, R10
+ SARQ CL, R10
+ SALQ CL, R10
+ // Apply mask
+ ANDQ R10, DX
+ POPCNTL DX, DX
+ ADDQ DX, R12
+ MOVQ R12, (R8)
+ RET
+endavx:
+ // Exit AVX mode.
+ VZEROUPPER
+ MOVQ R12, (R8)
+ RET
+
TEXT runtime·return0(SB), NOSPLIT, $0
MOVL $0, AX
RET
return hash, pow
}
-// Count counts the number of non-overlapping instances of substr in s.
-// If substr is an empty string, Count returns 1 + the number of Unicode code points in s.
-func Count(s, substr string) int {
+// countGeneric implements Count.
+func countGeneric(s, substr string) int {
// special case
if len(substr) == 0 {
return utf8.RuneCountInString(s) + 1
// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
// indexShortStr requires 2 <= len(c) <= shortStringLen
-func indexShortStr(s, c string) int // ../runtime/asm_$GOARCH.s
-func supportAVX2() bool // ../runtime/asm_$GOARCH.s
+func indexShortStr(s, c string) int // ../runtime/asm_$GOARCH.s
+func supportAVX2() bool // ../runtime/asm_$GOARCH.s
+func supportPOPCNT() bool // ../runtime/asm_$GOARCH.s
+func countByte(s string, c byte) int // ../runtime/asm_$GOARCH.s
var shortStringLen int
}
return -1
}
+
+// Count counts the number of non-overlapping instances of substr in s.
+// If substr is an empty string, Count returns 1 + the number of Unicode code points in s.
+func Count(s, substr string) int {
+ if len(substr) == 1 && supportPOPCNT() {
+ return countByte(s, byte(substr[0]))
+ }
+ return countGeneric(s, substr)
+}
}
return -1
}
+
+// Count counts the number of non-overlapping instances of substr in s.
+// If substr is an empty string, Count returns 1 + the number of Unicode code points in s.
+func Count(s, substr string) int {
+ return countGeneric(s, substr)
+}
}
return -1
}
+
+// Count counts the number of non-overlapping instances of substr in s.
+// If substr is an empty string, Count returns 1 + the number of Unicode code points in s.
+func Count(s, substr string) int {
+ return countGeneric(s, substr)
+}
}
}
+func BenchmarkCountByte(b *testing.B) {
+ indexSizes := []int{10, 32, 4 << 10, 4 << 20, 64 << 20}
+ benchStr := Repeat(benchmarkString,
+ (indexSizes[len(indexSizes)-1]+len(benchmarkString)-1)/len(benchmarkString))
+ benchFunc := func(b *testing.B, benchStr string) {
+ b.SetBytes(int64(len(benchStr)))
+ for i := 0; i < b.N; i++ {
+ Count(benchStr, "=")
+ }
+ }
+ for _, size := range indexSizes {
+ b.Run(fmt.Sprintf("%d", size), func(b *testing.B) {
+ benchFunc(b, benchStr[:size])
+ })
+ }
+
+}
+
var makeFieldsInput = func() string {
x := make([]byte, 1<<20)
// Input is ~10% space, ~10% 2-byte UTF-8, rest ASCII non-space.