From: Keith Randall Date: Sat, 16 Jan 2016 02:17:09 +0000 (-0800) Subject: runtime: avoid using REP prefix for IndexByte X-Git-Tag: go1.7beta1~1706 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=687abca1ea828dd4745d50c351f3b73ccd4d09be;p=gostls13.git runtime: avoid using REP prefix for IndexByte REP-prefixed instructions have a large startup cost. Avoid them like the plague. benchmark old ns/op new ns/op delta BenchmarkIndexByte10-8 22.4 5.34 -76.16% Fixes #13983 Change-Id: I857e956e240fc9681d053f2584ccf24c1b272bb3 Reviewed-on: https://go-review.googlesource.com/18703 Reviewed-by: Minux Ma Run-TryBot: Keith Randall TryBot-Result: Gobot Gobot --- diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go index 8df62fcc6a..a412dc89b9 100644 --- a/src/bytes/bytes_test.go +++ b/src/bytes/bytes_test.go @@ -335,6 +335,41 @@ func TestIndexByteBig(t *testing.T) { } } +// test a small index across all page offsets +func TestIndexByteSmall(t *testing.T) { + b := make([]byte, 5015) // bigger than a page + // Make sure we find the correct byte even when straddling a page. + for i := 0; i <= len(b)-15; i++ { + for j := 0; j < 15; j++ { + b[i+j] = byte(100 + j) + } + for j := 0; j < 15; j++ { + p := IndexByte(b[i:i+15], byte(100+j)) + if p != j { + t.Errorf("IndexByte(%q, %d) = %d", b[i:i+15], 100+j, p) + } + } + for j := 0; j < 15; j++ { + b[i+j] = 0 + } + } + // Make sure matches outside the slice never trigger. + for i := 0; i <= len(b)-15; i++ { + for j := 0; j < 15; j++ { + b[i+j] = 1 + } + for j := 0; j < 15; j++ { + p := IndexByte(b[i:i+15], byte(0)) + if p != -1 { + t.Errorf("IndexByte(%q, %d) = %d", b[i:i+15], 0, p) + } + } + for j := 0; j < 15; j++ { + b[i+j] = 0 + } + } +} + func TestIndexRune(t *testing.T) { for _, tt := range indexRuneTests { a := []byte(tt.a) @@ -348,10 +383,12 @@ func TestIndexRune(t *testing.T) { var bmbuf []byte +func BenchmarkIndexByte10(b *testing.B) { bmIndexByte(b, IndexByte, 10) } func BenchmarkIndexByte32(b *testing.B) { bmIndexByte(b, IndexByte, 32) } func BenchmarkIndexByte4K(b *testing.B) { bmIndexByte(b, IndexByte, 4<<10) } func BenchmarkIndexByte4M(b *testing.B) { bmIndexByte(b, IndexByte, 4<<20) } func BenchmarkIndexByte64M(b *testing.B) { bmIndexByte(b, IndexByte, 64<<20) } +func BenchmarkIndexBytePortable10(b *testing.B) { bmIndexByte(b, IndexBytePortable, 10) } func BenchmarkIndexBytePortable32(b *testing.B) { bmIndexByte(b, IndexBytePortable, 32) } func BenchmarkIndexBytePortable4K(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<10) } func BenchmarkIndexBytePortable4M(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<20) } diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 98a8e839ed..ac4630c833 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -1838,80 +1838,98 @@ TEXT strings·IndexByte(SB),NOSPLIT,$0-32 // AL: byte sought // R8: address to put result TEXT runtime·indexbytebody(SB),NOSPLIT,$0 - MOVQ SI, DI - - CMPQ BX, $16 - JLT small - - CMPQ BX, $32 - JA avx2 -no_avx2: - // round up to first 16-byte boundary - TESTQ $15, SI - JZ aligned - MOVQ SI, CX - ANDQ $~15, CX - ADDQ $16, CX - - // search the beginning - SUBQ SI, CX - REPN; SCASB - JZ success - -// DI is 16-byte aligned; get ready to search using SSE instructions -aligned: - // round down to last 16-byte boundary - MOVQ BX, R11 - ADDQ SI, R11 - ANDQ $~15, R11 - - // shuffle X0 around so that each byte contains c + // Shuffle X0 around so that each byte contains + // the character we're looking for. MOVD AX, X0 PUNPCKLBW X0, X0 PUNPCKLBW X0, X0 PSHUFL $0, X0, X0 - JMP condition + + CMPQ BX, $16 + JLT small + + MOVQ SI, DI + CMPQ BX, $32 + JA avx2 sse: - // move the next 16-byte chunk of the buffer into X1 - MOVO (DI), X1 - // compare bytes in X0 to X1 - PCMPEQB X0, X1 - // take the top bit of each byte in X1 and put the result in DX + LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes + JMP sseloopentry + +sseloop: + // Move the next 16-byte chunk of the data into X1. + MOVOU (DI), X1 + // Compare bytes in X0 to X1. + PCMPEQB X0, X1 + // Take the top bit of each byte in X1 and put the result in DX. PMOVMSKB X1, DX - TESTL DX, DX - JNZ ssesuccess - ADDQ $16, DI + // Find first set bit, if any. + BSFL DX, DX + JNZ ssesuccess + // Advance to next block. + ADDQ $16, DI +sseloopentry: + CMPQ DI, AX + JB sseloop -condition: - CMPQ DI, R11 - JLT sse - - // search the end - MOVQ SI, CX - ADDQ BX, CX - SUBQ R11, CX - // if CX == 0, the zero flag will be set and we'll end up - // returning a false success - JZ failure - REPN; SCASB - JZ success + // Search the last 16-byte chunk. This chunk may overlap with the + // chunks we've already searched, but that's ok. + MOVQ AX, DI + MOVOU (AX), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, DX + BSFL DX, DX + JNZ ssesuccess failure: MOVQ $-1, (R8) RET +// We've found a chunk containing the byte. +// The chunk was loaded from DI. +// The index of the matching byte in the chunk is DX. +// The start of the data is SI. +ssesuccess: + SUBQ SI, DI // Compute offset of chunk within data. + ADDQ DX, DI // Add offset of byte within chunk. + MOVQ DI, (R8) + RET + // handle for lengths < 16 small: - MOVQ BX, CX - REPN; SCASB - JZ success - MOVQ $-1, (R8) + TESTQ BX, BX + JEQ failure + + // Check if we'll load across a page boundary. + LEAQ 16(SI), AX + TESTW $0xff0, AX + JEQ endofpage + + MOVOU (SI), X1 // Load data + PCMPEQB X0, X1 // Compare target byte with each byte in data. + PMOVMSKB X1, DX // Move result bits to integer register. + BSFL DX, DX // Find first set bit. + JZ failure // No set bit, failure. + CMPL DX, BX + JAE failure // Match is past end of data. + MOVQ DX, (R8) + RET + +endofpage: + MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1. + PCMPEQB X0, X1 // Compare target byte with each byte in data. + PMOVMSKB X1, DX // Move result bits to integer register. + MOVL BX, CX + SHLL CX, DX + SHRL $16, DX // Shift desired bits down to bottom of register. + BSFL DX, DX // Find first set bit. + JZ failure // No set bit, failure. + MOVQ DX, (R8) RET avx2: CMPB runtime·support_avx2(SB), $1 - JNE no_avx2 + JNE sse MOVD AX, X0 LEAQ -32(SI)(BX*1), R11 VPBROADCASTB X0, Y1 @@ -1941,22 +1959,6 @@ avx2success: VZEROUPPER RET -// we've found the chunk containing the byte -// now just figure out which specific byte it is -ssesuccess: - // get the index of the least significant set bit - BSFW DX, DX - SUBQ SI, DI - ADDQ DI, DX - MOVQ DX, (R8) - RET - -success: - SUBQ SI, DI - SUBL $1, DI - MOVQ DI, (R8) - RET - TEXT bytes·Equal(SB),NOSPLIT,$0-49 MOVQ a_len+8(FP), BX MOVQ b_len+32(FP), CX