From d206af1e6c53df0c59d9466fe9c50415f9d8dcd5 Mon Sep 17 00:00:00 2001 From: Josselin Costanzi Date: Mon, 27 Mar 2017 13:22:59 +0200 Subject: [PATCH] strings: optimize Count for amd64 MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Move optimized Count implementation from bytes to runtime. Use in both bytes and strings packages. Add CountByte benchmark to strings. Strings benchmarks: name old time/op new time/op delta CountHard1-4 226µs ± 1% 226µs ± 2% ~ (p=0.247 n=10+10) CountHard2-4 316µs ± 1% 315µs ± 0% ~ (p=0.133 n=9+10) CountHard3-4 919µs ± 1% 920µs ± 1% ~ (p=0.968 n=10+9) CountTorture-4 15.4µs ± 1% 15.7µs ± 1% +2.47% (p=0.000 n=10+9) CountTortureOverlapping-4 9.60ms ± 0% 9.65ms ± 1% ~ (p=0.247 n=10+10) CountByte/10-4 26.3ns ± 1% 10.9ns ± 1% -58.71% (p=0.000 n=9+9) CountByte/32-4 42.7ns ± 0% 14.2ns ± 0% -66.64% (p=0.000 n=10+10) CountByte/4096-4 3.07µs ± 0% 0.31µs ± 2% -89.99% (p=0.000 n=9+10) CountByte/4194304-4 3.48ms ± 1% 0.34ms ± 1% -90.09% (p=0.000 n=10+9) CountByte/67108864-4 55.6ms ± 1% 7.0ms ± 0% -87.49% (p=0.000 n=9+8) name old speed new speed delta CountByte/10-4 380MB/s ± 1% 919MB/s ± 1% +142.21% (p=0.000 n=9+9) CountByte/32-4 750MB/s ± 0% 2247MB/s ± 0% +199.62% (p=0.000 n=10+10) CountByte/4096-4 1.33GB/s ± 0% 13.32GB/s ± 2% +898.13% (p=0.000 n=9+10) CountByte/4194304-4 1.21GB/s ± 1% 12.17GB/s ± 1% +908.87% (p=0.000 n=10+9) CountByte/67108864-4 1.21GB/s ± 1% 9.65GB/s ± 0% +699.29% (p=0.000 n=9+8) Fixes #19411 Change-Id: I8d2d409f0fa6df6d03b60790aa86e540b4a4e3b0 Reviewed-on: https://go-review.googlesource.com/38693 Reviewed-by: Keith Randall --- src/bytes/bytes_amd64.go | 10 +- src/bytes/bytes_amd64.s | 183 -------------------------- src/cmd/vet/all/whitelist/amd64.txt | 3 + src/runtime/asm_amd64.s | 195 ++++++++++++++++++++++++++++ src/strings/strings.go | 5 +- src/strings/strings_amd64.go | 15 ++- src/strings/strings_generic.go | 6 + src/strings/strings_s390x.go | 6 + src/strings/strings_test.go | 18 +++ 9 files changed, 247 insertions(+), 194 deletions(-) delete mode 100644 src/bytes/bytes_amd64.s diff --git a/src/bytes/bytes_amd64.go b/src/bytes/bytes_amd64.go index ac9c002d6d..e68a3920d0 100644 --- a/src/bytes/bytes_amd64.go +++ b/src/bytes/bytes_amd64.go @@ -8,9 +8,10 @@ package bytes // indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s. // indexShortStr requires 2 <= len(c) <= shortStringLen -func indexShortStr(s, c []byte) int // ../runtime/asm_$GOARCH.s -func supportAVX2() bool // ../runtime/asm_$GOARCH.s -func supportPOPCNT() bool // ../runtime/asm_$GOARCH.s +func indexShortStr(s, c []byte) int // ../runtime/asm_$GOARCH.s +func supportAVX2() bool // ../runtime/asm_$GOARCH.s +func supportPOPCNT() bool // ../runtime/asm_$GOARCH.s +func countByte(s []byte, c byte) int // ../runtime/asm_$GOARCH.s var shortStringLen int @@ -95,9 +96,6 @@ func Index(s, sep []byte) int { return -1 } -// Special case for when we must count occurrences of a single byte. -func countByte(s []byte, c byte) int - // Count counts the number of non-overlapping instances of sep in s. // If sep is an empty slice, Count returns 1 + the number of Unicode code points in s. func Count(s, sep []byte) int { diff --git a/src/bytes/bytes_amd64.s b/src/bytes/bytes_amd64.s deleted file mode 100644 index a710e22510..0000000000 --- a/src/bytes/bytes_amd64.s +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "textflag.h" - -// We use: -// SI: data -// BX: data len -// AL: byte sought -// This requires the POPCNT instruction -TEXT ·countByte(SB),NOSPLIT,$0-40 - MOVQ s+0(FP), SI - MOVQ s_len+8(FP), BX - MOVB c+24(FP), AL - - // Shuffle X0 around so that each byte contains - // the character we're looking for. - MOVD AX, X0 - PUNPCKLBW X0, X0 - PUNPCKLBW X0, X0 - PSHUFL $0, X0, X0 - - CMPQ BX, $16 - JLT small - - MOVQ $0, R12 // Accumulator - - MOVQ SI, DI - - CMPQ BX, $32 - JA avx2 -sse: - LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes - JMP sseloopentry - -sseloop: - // Move the next 16-byte chunk of the data into X1. - MOVOU (DI), X1 - // Compare bytes in X0 to X1. - PCMPEQB X0, X1 - // Take the top bit of each byte in X1 and put the result in DX. - PMOVMSKB X1, DX - // Count number of matching bytes - POPCNTL DX, DX - // Accumulate into R12 - ADDQ DX, R12 - // Advance to next block. - ADDQ $16, DI -sseloopentry: - CMPQ DI, AX - JBE sseloop - - // Get the number of bytes to consider in the last 16 bytes - ANDQ $15, BX - JZ end - - // Create mask to ignore overlap between previous 16 byte block - // and the next. - MOVQ $16,CX - SUBQ BX, CX - MOVQ $0xFFFF, R10 - SARQ CL, R10 - SALQ CL, R10 - - // Process the last 16-byte chunk. This chunk may overlap with the - // chunks we've already searched so we need to mask part of it. - MOVOU (AX), X1 - PCMPEQB X0, X1 - PMOVMSKB X1, DX - // Apply mask - ANDQ R10, DX - POPCNTL DX, DX - ADDQ DX, R12 -end: - MOVQ R12, ret+32(FP) - RET - -// handle for lengths < 16 -small: - TESTQ BX, BX - JEQ endzero - - // Check if we'll load across a page boundary. - LEAQ 16(SI), AX - TESTW $0xff0, AX - JEQ endofpage - - // We must ignore high bytes as they aren't part of our slice. - // Create mask. - MOVB BX, CX - MOVQ $1, R10 - SALQ CL, R10 - SUBQ $1, R10 - - // Load data - MOVOU (SI), X1 - // Compare target byte with each byte in data. - PCMPEQB X0, X1 - // Move result bits to integer register. - PMOVMSKB X1, DX - // Apply mask - ANDQ R10, DX - POPCNTL DX, DX - // Directly return DX, we don't need to accumulate - // since we have <16 bytes. - MOVQ DX, ret+32(FP) - RET -endzero: - MOVQ $0, ret+32(FP) - RET - -endofpage: - // We must ignore low bytes as they aren't part of our slice. - MOVQ $16,CX - SUBQ BX, CX - MOVQ $0xFFFF, R10 - SARQ CL, R10 - SALQ CL, R10 - - // Load data into the high end of X1. - MOVOU -16(SI)(BX*1), X1 - // Compare target byte with each byte in data. - PCMPEQB X0, X1 - // Move result bits to integer register. - PMOVMSKB X1, DX - // Apply mask - ANDQ R10, DX - // Directly return DX, we don't need to accumulate - // since we have <16 bytes. - POPCNTL DX, DX - MOVQ DX, ret+32(FP) - RET - -avx2: - CMPB runtime·support_avx2(SB), $1 - JNE sse - MOVD AX, X0 - LEAQ -32(SI)(BX*1), R11 - VPBROADCASTB X0, Y1 -avx2_loop: - VMOVDQU (DI), Y2 - VPCMPEQB Y1, Y2, Y3 - VPMOVMSKB Y3, DX - POPCNTL DX, DX - ADDQ DX, R12 - ADDQ $32, DI - CMPQ DI, R11 - JLE avx2_loop - - // If last block is already processed, - // skip to the end. - CMPQ DI, R11 - JEQ endavx - - // Load address of the last 32 bytes. - // There is an overlap with the previous block. - MOVQ R11, DI - VMOVDQU (DI), Y2 - VPCMPEQB Y1, Y2, Y3 - VPMOVMSKB Y3, DX - // Exit AVX mode. - VZEROUPPER - - // Create mask to ignore overlap between previous 32 byte block - // and the next. - ANDQ $31, BX - MOVQ $32,CX - SUBQ BX, CX - MOVQ $0xFFFFFFFF, R10 - SARQ CL, R10 - SALQ CL, R10 - // Apply mask - ANDQ R10, DX - POPCNTL DX, DX - ADDQ DX, R12 - MOVQ R12, ret+32(FP) - RET -endavx: - // Exit AVX mode. - VZEROUPPER - MOVQ R12, ret+32(FP) - RET diff --git a/src/cmd/vet/all/whitelist/amd64.txt b/src/cmd/vet/all/whitelist/amd64.txt index 92a693af83..9056f809f2 100644 --- a/src/cmd/vet/all/whitelist/amd64.txt +++ b/src/cmd/vet/all/whitelist/amd64.txt @@ -17,7 +17,10 @@ runtime/asm_amd64.s: [GOARCH] cannot check cross-package assembly function: Comp runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package bytes runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: supportAVX2 is in package strings runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: supportAVX2 is in package bytes +runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: supportPOPCNT is in package strings runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: supportPOPCNT is in package bytes +runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: countByte is in package strings +runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: countByte is in package bytes // Intentionally missing declarations. These are special assembly routines. // Some are jumped into from other routines, with values in specific registers. diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index c0a5048eda..36da4cc922 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -1702,6 +1702,11 @@ TEXT bytes·supportAVX2(SB),NOSPLIT,$0-1 MOVB AX, ret+0(FP) RET +TEXT strings·supportPOPCNT(SB),NOSPLIT,$0-1 + MOVBLZX runtime·support_popcnt(SB), AX + MOVB AX, ret+0(FP) + RET + TEXT bytes·supportPOPCNT(SB),NOSPLIT,$0-1 MOVBLZX runtime·support_popcnt(SB), AX MOVB AX, ret+0(FP) @@ -2131,6 +2136,196 @@ eqret: MOVB $0, ret+48(FP) RET + +TEXT bytes·countByte(SB),NOSPLIT,$0-40 + MOVQ s+0(FP), SI + MOVQ s_len+8(FP), BX + MOVB c+24(FP), AL + LEAQ ret+32(FP), R8 + JMP runtime·countByte(SB) + +TEXT strings·countByte(SB),NOSPLIT,$0-32 + MOVQ s+0(FP), SI + MOVQ s_len+8(FP), BX + MOVB c+16(FP), AL + LEAQ ret+24(FP), R8 + JMP runtime·countByte(SB) + +// input: +// SI: data +// BX: data len +// AL: byte sought +// R8: address to put result +// This requires the POPCNT instruction +TEXT runtime·countByte(SB),NOSPLIT,$0 + // Shuffle X0 around so that each byte contains + // the character we're looking for. + MOVD AX, X0 + PUNPCKLBW X0, X0 + PUNPCKLBW X0, X0 + PSHUFL $0, X0, X0 + + CMPQ BX, $16 + JLT small + + MOVQ $0, R12 // Accumulator + + MOVQ SI, DI + + CMPQ BX, $32 + JA avx2 +sse: + LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes + JMP sseloopentry + +sseloop: + // Move the next 16-byte chunk of the data into X1. + MOVOU (DI), X1 + // Compare bytes in X0 to X1. + PCMPEQB X0, X1 + // Take the top bit of each byte in X1 and put the result in DX. + PMOVMSKB X1, DX + // Count number of matching bytes + POPCNTL DX, DX + // Accumulate into R12 + ADDQ DX, R12 + // Advance to next block. + ADDQ $16, DI +sseloopentry: + CMPQ DI, AX + JBE sseloop + + // Get the number of bytes to consider in the last 16 bytes + ANDQ $15, BX + JZ end + + // Create mask to ignore overlap between previous 16 byte block + // and the next. + MOVQ $16,CX + SUBQ BX, CX + MOVQ $0xFFFF, R10 + SARQ CL, R10 + SALQ CL, R10 + + // Process the last 16-byte chunk. This chunk may overlap with the + // chunks we've already searched so we need to mask part of it. + MOVOU (AX), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, DX + // Apply mask + ANDQ R10, DX + POPCNTL DX, DX + ADDQ DX, R12 +end: + MOVQ R12, (R8) + RET + +// handle for lengths < 16 +small: + TESTQ BX, BX + JEQ endzero + + // Check if we'll load across a page boundary. + LEAQ 16(SI), AX + TESTW $0xff0, AX + JEQ endofpage + + // We must ignore high bytes as they aren't part of our slice. + // Create mask. + MOVB BX, CX + MOVQ $1, R10 + SALQ CL, R10 + SUBQ $1, R10 + + // Load data + MOVOU (SI), X1 + // Compare target byte with each byte in data. + PCMPEQB X0, X1 + // Move result bits to integer register. + PMOVMSKB X1, DX + // Apply mask + ANDQ R10, DX + POPCNTL DX, DX + // Directly return DX, we don't need to accumulate + // since we have <16 bytes. + MOVQ DX, (R8) + RET +endzero: + MOVQ $0, (R8) + RET + +endofpage: + // We must ignore low bytes as they aren't part of our slice. + MOVQ $16,CX + SUBQ BX, CX + MOVQ $0xFFFF, R10 + SARQ CL, R10 + SALQ CL, R10 + + // Load data into the high end of X1. + MOVOU -16(SI)(BX*1), X1 + // Compare target byte with each byte in data. + PCMPEQB X0, X1 + // Move result bits to integer register. + PMOVMSKB X1, DX + // Apply mask + ANDQ R10, DX + // Directly return DX, we don't need to accumulate + // since we have <16 bytes. + POPCNTL DX, DX + MOVQ DX, (R8) + RET + +avx2: + CMPB runtime·support_avx2(SB), $1 + JNE sse + MOVD AX, X0 + LEAQ -32(SI)(BX*1), R11 + VPBROADCASTB X0, Y1 +avx2_loop: + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPMOVMSKB Y3, DX + POPCNTL DX, DX + ADDQ DX, R12 + ADDQ $32, DI + CMPQ DI, R11 + JLE avx2_loop + + // If last block is already processed, + // skip to the end. + CMPQ DI, R11 + JEQ endavx + + // Load address of the last 32 bytes. + // There is an overlap with the previous block. + MOVQ R11, DI + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPMOVMSKB Y3, DX + // Exit AVX mode. + VZEROUPPER + + // Create mask to ignore overlap between previous 32 byte block + // and the next. + ANDQ $31, BX + MOVQ $32,CX + SUBQ BX, CX + MOVQ $0xFFFFFFFF, R10 + SARQ CL, R10 + SALQ CL, R10 + // Apply mask + ANDQ R10, DX + POPCNTL DX, DX + ADDQ DX, R12 + MOVQ R12, (R8) + RET +endavx: + // Exit AVX mode. + VZEROUPPER + MOVQ R12, (R8) + RET + TEXT runtime·return0(SB), NOSPLIT, $0 MOVL $0, AX RET diff --git a/src/strings/strings.go b/src/strings/strings.go index a01eb698c4..d3bfe1f729 100644 --- a/src/strings/strings.go +++ b/src/strings/strings.go @@ -72,9 +72,8 @@ func hashStrRev(sep string) (uint32, uint32) { return hash, pow } -// Count counts the number of non-overlapping instances of substr in s. -// If substr is an empty string, Count returns 1 + the number of Unicode code points in s. -func Count(s, substr string) int { +// countGeneric implements Count. +func countGeneric(s, substr string) int { // special case if len(substr) == 0 { return utf8.RuneCountInString(s) + 1 diff --git a/src/strings/strings_amd64.go b/src/strings/strings_amd64.go index 8f6ac1de74..33771480a6 100644 --- a/src/strings/strings_amd64.go +++ b/src/strings/strings_amd64.go @@ -8,8 +8,10 @@ package strings // indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s. // indexShortStr requires 2 <= len(c) <= shortStringLen -func indexShortStr(s, c string) int // ../runtime/asm_$GOARCH.s -func supportAVX2() bool // ../runtime/asm_$GOARCH.s +func indexShortStr(s, c string) int // ../runtime/asm_$GOARCH.s +func supportAVX2() bool // ../runtime/asm_$GOARCH.s +func supportPOPCNT() bool // ../runtime/asm_$GOARCH.s +func countByte(s string, c byte) int // ../runtime/asm_$GOARCH.s var shortStringLen int @@ -93,3 +95,12 @@ func Index(s, substr string) int { } return -1 } + +// Count counts the number of non-overlapping instances of substr in s. +// If substr is an empty string, Count returns 1 + the number of Unicode code points in s. +func Count(s, substr string) int { + if len(substr) == 1 && supportPOPCNT() { + return countByte(s, byte(substr[0])) + } + return countGeneric(s, substr) +} diff --git a/src/strings/strings_generic.go b/src/strings/strings_generic.go index 873d75ee1c..5429a74a22 100644 --- a/src/strings/strings_generic.go +++ b/src/strings/strings_generic.go @@ -45,3 +45,9 @@ func Index(s, substr string) int { } return -1 } + +// Count counts the number of non-overlapping instances of substr in s. +// If substr is an empty string, Count returns 1 + the number of Unicode code points in s. +func Count(s, substr string) int { + return countGeneric(s, substr) +} diff --git a/src/strings/strings_s390x.go b/src/strings/strings_s390x.go index 32520459be..ccf2da632d 100644 --- a/src/strings/strings_s390x.go +++ b/src/strings/strings_s390x.go @@ -96,3 +96,9 @@ func Index(s, substr string) int { } return -1 } + +// Count counts the number of non-overlapping instances of substr in s. +// If substr is an empty string, Count returns 1 + the number of Unicode code points in s. +func Count(s, substr string) int { + return countGeneric(s, substr) +} diff --git a/src/strings/strings_test.go b/src/strings/strings_test.go index 58314a6868..869be9c477 100644 --- a/src/strings/strings_test.go +++ b/src/strings/strings_test.go @@ -1457,6 +1457,24 @@ func BenchmarkCountTortureOverlapping(b *testing.B) { } } +func BenchmarkCountByte(b *testing.B) { + indexSizes := []int{10, 32, 4 << 10, 4 << 20, 64 << 20} + benchStr := Repeat(benchmarkString, + (indexSizes[len(indexSizes)-1]+len(benchmarkString)-1)/len(benchmarkString)) + benchFunc := func(b *testing.B, benchStr string) { + b.SetBytes(int64(len(benchStr))) + for i := 0; i < b.N; i++ { + Count(benchStr, "=") + } + } + for _, size := range indexSizes { + b.Run(fmt.Sprintf("%d", size), func(b *testing.B) { + benchFunc(b, benchStr[:size]) + }) + } + +} + var makeFieldsInput = func() string { x := make([]byte, 1<<20) // Input is ~10% space, ~10% 2-byte UTF-8, rest ASCII non-space. -- 2.48.1