}
}
+// test a small index across all page offsets
+func TestIndexByteSmall(t *testing.T) {
+ b := make([]byte, 5015) // bigger than a page
+ // Make sure we find the correct byte even when straddling a page.
+ for i := 0; i <= len(b)-15; i++ {
+ for j := 0; j < 15; j++ {
+ b[i+j] = byte(100 + j)
+ }
+ for j := 0; j < 15; j++ {
+ p := IndexByte(b[i:i+15], byte(100+j))
+ if p != j {
+ t.Errorf("IndexByte(%q, %d) = %d", b[i:i+15], 100+j, p)
+ }
+ }
+ for j := 0; j < 15; j++ {
+ b[i+j] = 0
+ }
+ }
+ // Make sure matches outside the slice never trigger.
+ for i := 0; i <= len(b)-15; i++ {
+ for j := 0; j < 15; j++ {
+ b[i+j] = 1
+ }
+ for j := 0; j < 15; j++ {
+ p := IndexByte(b[i:i+15], byte(0))
+ if p != -1 {
+ t.Errorf("IndexByte(%q, %d) = %d", b[i:i+15], 0, p)
+ }
+ }
+ for j := 0; j < 15; j++ {
+ b[i+j] = 0
+ }
+ }
+}
+
func TestIndexRune(t *testing.T) {
for _, tt := range indexRuneTests {
a := []byte(tt.a)
var bmbuf []byte
+func BenchmarkIndexByte10(b *testing.B) { bmIndexByte(b, IndexByte, 10) }
func BenchmarkIndexByte32(b *testing.B) { bmIndexByte(b, IndexByte, 32) }
func BenchmarkIndexByte4K(b *testing.B) { bmIndexByte(b, IndexByte, 4<<10) }
func BenchmarkIndexByte4M(b *testing.B) { bmIndexByte(b, IndexByte, 4<<20) }
func BenchmarkIndexByte64M(b *testing.B) { bmIndexByte(b, IndexByte, 64<<20) }
+func BenchmarkIndexBytePortable10(b *testing.B) { bmIndexByte(b, IndexBytePortable, 10) }
func BenchmarkIndexBytePortable32(b *testing.B) { bmIndexByte(b, IndexBytePortable, 32) }
func BenchmarkIndexBytePortable4K(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<10) }
func BenchmarkIndexBytePortable4M(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<20) }
// AL: byte sought
// R8: address to put result
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
- MOVQ SI, DI
-
- CMPQ BX, $16
- JLT small
-
- CMPQ BX, $32
- JA avx2
-no_avx2:
- // round up to first 16-byte boundary
- TESTQ $15, SI
- JZ aligned
- MOVQ SI, CX
- ANDQ $~15, CX
- ADDQ $16, CX
-
- // search the beginning
- SUBQ SI, CX
- REPN; SCASB
- JZ success
-
-// DI is 16-byte aligned; get ready to search using SSE instructions
-aligned:
- // round down to last 16-byte boundary
- MOVQ BX, R11
- ADDQ SI, R11
- ANDQ $~15, R11
-
- // shuffle X0 around so that each byte contains c
+ // Shuffle X0 around so that each byte contains
+ // the character we're looking for.
MOVD AX, X0
PUNPCKLBW X0, X0
PUNPCKLBW X0, X0
PSHUFL $0, X0, X0
- JMP condition
+
+ CMPQ BX, $16
+ JLT small
+
+ MOVQ SI, DI
+ CMPQ BX, $32
+ JA avx2
sse:
- // move the next 16-byte chunk of the buffer into X1
- MOVO (DI), X1
- // compare bytes in X0 to X1
- PCMPEQB X0, X1
- // take the top bit of each byte in X1 and put the result in DX
+ LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
+ JMP sseloopentry
+
+sseloop:
+ // Move the next 16-byte chunk of the data into X1.
+ MOVOU (DI), X1
+ // Compare bytes in X0 to X1.
+ PCMPEQB X0, X1
+ // Take the top bit of each byte in X1 and put the result in DX.
PMOVMSKB X1, DX
- TESTL DX, DX
- JNZ ssesuccess
- ADDQ $16, DI
+ // Find first set bit, if any.
+ BSFL DX, DX
+ JNZ ssesuccess
+ // Advance to next block.
+ ADDQ $16, DI
+sseloopentry:
+ CMPQ DI, AX
+ JB sseloop
-condition:
- CMPQ DI, R11
- JLT sse
-
- // search the end
- MOVQ SI, CX
- ADDQ BX, CX
- SUBQ R11, CX
- // if CX == 0, the zero flag will be set and we'll end up
- // returning a false success
- JZ failure
- REPN; SCASB
- JZ success
+ // Search the last 16-byte chunk. This chunk may overlap with the
+ // chunks we've already searched, but that's ok.
+ MOVQ AX, DI
+ MOVOU (AX), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, DX
+ BSFL DX, DX
+ JNZ ssesuccess
failure:
MOVQ $-1, (R8)
RET
+// We've found a chunk containing the byte.
+// The chunk was loaded from DI.
+// The index of the matching byte in the chunk is DX.
+// The start of the data is SI.
+ssesuccess:
+ SUBQ SI, DI // Compute offset of chunk within data.
+ ADDQ DX, DI // Add offset of byte within chunk.
+ MOVQ DI, (R8)
+ RET
+
// handle for lengths < 16
small:
- MOVQ BX, CX
- REPN; SCASB
- JZ success
- MOVQ $-1, (R8)
+ TESTQ BX, BX
+ JEQ failure
+
+ // Check if we'll load across a page boundary.
+ LEAQ 16(SI), AX
+ TESTW $0xff0, AX
+ JEQ endofpage
+
+ MOVOU (SI), X1 // Load data
+ PCMPEQB X0, X1 // Compare target byte with each byte in data.
+ PMOVMSKB X1, DX // Move result bits to integer register.
+ BSFL DX, DX // Find first set bit.
+ JZ failure // No set bit, failure.
+ CMPL DX, BX
+ JAE failure // Match is past end of data.
+ MOVQ DX, (R8)
+ RET
+
+endofpage:
+ MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1.
+ PCMPEQB X0, X1 // Compare target byte with each byte in data.
+ PMOVMSKB X1, DX // Move result bits to integer register.
+ MOVL BX, CX
+ SHLL CX, DX
+ SHRL $16, DX // Shift desired bits down to bottom of register.
+ BSFL DX, DX // Find first set bit.
+ JZ failure // No set bit, failure.
+ MOVQ DX, (R8)
RET
avx2:
CMPB runtime·support_avx2(SB), $1
- JNE no_avx2
+ JNE sse
MOVD AX, X0
LEAQ -32(SI)(BX*1), R11
VPBROADCASTB X0, Y1
VZEROUPPER
RET
-// we've found the chunk containing the byte
-// now just figure out which specific byte it is
-ssesuccess:
- // get the index of the least significant set bit
- BSFW DX, DX
- SUBQ SI, DI
- ADDQ DI, DX
- MOVQ DX, (R8)
- RET
-
-success:
- SUBQ SI, DI
- SUBL $1, DI
- MOVQ DI, (R8)
- RET
-
TEXT bytes·Equal(SB),NOSPLIT,$0-49
MOVQ a_len+8(FP), BX
MOVQ b_len+32(FP), CX