]> Cypherpunks repositories - gostls13.git/commitdiff
bytes: SSE for bytes.IndexByte on amd64
authorEvan Shaw <chickencha@gmail.com>
Tue, 9 Nov 2010 01:33:53 +0000 (17:33 -0800)
committerRob Pike <r@golang.org>
Tue, 9 Nov 2010 01:33:53 +0000 (17:33 -0800)
Performance on 2.8 GHz Intel Core i7:

Before:
BenchmarkIndexByte4K  1000000              2997 ns/op        1366.70 MB/s
BenchmarkIndexByte4M      500           3049772 ns/op        1375.28 MB/s
BenchmarkIndexByte64M      50          49582280 ns/op        1353.48 MB/s

After:
BenchmarkIndexByte4K 10000000               298 ns/op       13744.97 MB/s
BenchmarkIndexByte4M    10000            285993 ns/op       14665.76 MB/s
BenchmarkIndexByte64M     500           4618172 ns/op       14531.48 MB/s

R=rsc, PeterGo, r2, r
CC=golang-dev
https://golang.org/cl/2888041

src/pkg/bytes/asm_amd64.s
src/pkg/bytes/bytes_test.go

index 7e78700ecfb3a31fef8b602ffcb4bf46f593a791..c6793cbdcc7f02d6774110da8befbf638f50c64b 100644 (file)
@@ -3,15 +3,90 @@
 // license that can be found in the LICENSE file.
 
 TEXT ·IndexByte(SB),7,$0
-       MOVQ    p+0(FP), SI
-       MOVL    len+8(FP), CX
-       MOVB    b+16(FP), AL
-       MOVQ    SI, DI
+       MOVQ p+0(FP), SI
+       MOVL len+8(FP), BX
+       MOVB b+16(FP), AL
+       MOVQ SI, DI
+
+       CMPL BX, $16
+       JLT small
+
+       // round up to first 16-byte boundary
+       TESTQ $15, SI
+       JZ aligned
+       MOVQ SI, CX
+       ANDQ $~15, CX
+       ADDQ $16, CX
+
+       // search the beginning
+       SUBQ SI, CX
+       REPN; SCASB
+       JZ success
+
+// DI is 16-byte aligned; get ready to search using SSE instructions
+aligned:
+       // round down to last 16-byte boundary
+       MOVQ BX, R11
+       ADDQ SI, R11
+       ANDQ $~15, R11
+
+       // shuffle X0 around so that each byte contains c
+       MOVD AX, X0
+       PUNPCKLBW X0, X0
+       PUNPCKLBW X0, X0
+       PSHUFL $0, X0, X0
+       JMP condition
+
+sse:
+       // move the next 16-byte chunk of the buffer into X1
+       MOVO (DI), X1
+       // compare bytes in X0 to X1
+       PCMPEQB X0, X1
+       // take the top bit of each byte in X1 and put the result in DX
+       PMOVMSKB X1, DX
+       TESTL DX, DX
+       JNZ ssesuccess
+       ADDQ $16, DI
+
+condition:
+       CMPQ DI, R11
+       JLT sse
+
+       // search the end
+       MOVQ SI, CX
+       ADDQ BX, CX
+       SUBQ R11, CX
+       // if CX == 0, the zero flag will be set and we'll end up
+       // returning a false success
+       JZ failure
        REPN; SCASB
-       JZ 3(PC)
-       MOVL    $-1, ret+24(FP)
+       JZ success
+
+failure:
+       MOVL $-1, ret+24(FP)
+       RET
+
+// handle for lengths < 16
+small:
+       MOVL BX, CX
+       REPN; SCASB
+       JZ success
+       MOVL $-1, ret+24(FP)
        RET
-       SUBQ    SI, DI
-       SUBL    $1, DI
-       MOVL    DI, ret+24(FP)
+
+// we've found the chunk containing the byte
+// now just figure out which specific byte it is
+ssesuccess:
+       // get the index of the least significant set bit
+       BSFW DX, DX
+       SUBQ SI, DI
+       ADDQ DI, DX
+       MOVL DX, ret+24(FP)
+       RET
+
+success:
+       SUBQ SI, DI
+       SUBL $1, DI
+       MOVL DI, ret+24(FP)
        RET
+
index 6f42338eb8e2c30795abb141dbff78f9975f46d0..f3ca371f83e76f48b20b7d8f2474f004e89ba6c2 100644 (file)
@@ -93,6 +93,9 @@ var indexTests = []BinOpTest{
        {"abc", "b", 1},
        {"abc", "c", 2},
        {"abc", "x", -1},
+       {"barfoobarfooyyyzzzyyyzzzyyyzzzyyyxxxzzzyyy", "x", 33},
+       {"foofyfoobarfoobar", "y", 4},
+       {"oooooooooooooooooooooo", "r", -1},
 }
 
 var lastIndexTests = []BinOpTest{
@@ -177,6 +180,56 @@ func TestIndexByte(t *testing.T) {
        }
 }
 
+// test a larger buffer with different sizes and alignments
+func TestIndexByteBig(t *testing.T) {
+       const n = 1024
+       b := make([]byte, n)
+       for i := 0; i < n; i++ {
+               // different start alignments
+               b1 := b[i:]
+               for j := 0; j < len(b1); j++ {
+                       b1[j] = 'x'
+                       pos := IndexByte(b1, 'x')
+                       if pos != j {
+                               t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
+                       }
+                       b1[j] = 0
+                       pos = IndexByte(b1, 'x')
+                       if pos != -1 {
+                               t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
+                       }
+               }
+               // different end alignments
+               b1 = b[:i]
+               for j := 0; j < len(b1); j++ {
+                       b1[j] = 'x'
+                       pos := IndexByte(b1, 'x')
+                       if pos != j {
+                               t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
+                       }
+                       b1[j] = 0
+                       pos = IndexByte(b1, 'x')
+                       if pos != -1 {
+                               t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
+                       }
+               }
+               // different start and end alignments
+               b1 = b[i/2 : n-(i+1)/2]
+               for j := 0; j < len(b1); j++ {
+                       b1[j] = 'x'
+                       pos := IndexByte(b1, 'x')
+                       if pos != j {
+                               t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
+                       }
+                       b1[j] = 0
+                       pos = IndexByte(b1, 'x')
+                       if pos != -1 {
+                               t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
+                       }
+               }
+       }
+}
+
 func TestIndexRune(t *testing.T) {
        for _, tt := range indexRuneTests {
                a := []byte(tt.a)