From 44f1854c9dc82d8dba415ef102e93896d57c2c0d Mon Sep 17 00:00:00 2001 From: Ilya Tocar Date: Thu, 28 Apr 2016 17:34:24 +0300 Subject: [PATCH] bytes: Use the same algorithm as strings for Index MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit name old time/op new time/op delta IndexByte32-48 9.05ns ± 7% 9.59ns ±11% +5.93% (p=0.001 n=19+20) IndexByte4K-48 118ns ± 4% 122ns ± 8% +3.52% (p=0.002 n=19+19) IndexByte4M-48 172µs ±13% 188µs ±12% +9.49% (p=0.000 n=20+20) IndexByte64M-48 8.00ms ±14% 8.05ms ±23% ~ (p=0.799 n=20+20) IndexBytePortable32-48 41.7ns ±15% 42.5ns ±12% ~ (p=0.372 n=20+20) IndexBytePortable4K-48 3.08µs ±16% 3.26µs ±10% +5.77% (p=0.018 n=20+20) IndexBytePortable4M-48 3.12ms ±17% 3.20ms ±10% ~ (p=0.157 n=20+20) IndexBytePortable64M-48 54.0ms ±14% 55.3ms ±14% ~ (p=0.640 n=20+20) Index32-48 230ns ±12% 46ns ± 6% -79.87% (p=0.000 n=20+19) Index4K-48 43.2µs ± 9% 3.2µs ±12% -92.58% (p=0.000 n=19+20) Index4M-48 44.4ms ± 7% 3.3ms ±13% -92.59% (p=0.000 n=19+20) Index64M-48 714ms ±10% 56ms ± 8% -92.22% (p=0.000 n=19+19) IndexEasy32-48 52.7ns ±10% 31.0ns ±11% -41.21% (p=0.000 n=20+20) IndexEasy4K-48 139ns ± 5% 1598ns ± 6% +1046.37% (p=0.000 n=19+19) IndexEasy4M-48 179µs ± 8% 1674µs ±10% +834.31% (p=0.000 n=19+20) IndexEasy64M-48 8.56ms ±10% 27.82ms ±16% +225.14% (p=0.000 n=19+20) name old speed new speed delta IndexByte32-48 3.52GB/s ± 7% 3.35GB/s ±11% -4.99% (p=0.001 n=20+20) IndexByte4K-48 34.5GB/s ± 7% 33.2GB/s ±10% -3.67% (p=0.002 n=20+20) IndexByte4M-48 24.6GB/s ±14% 22.4GB/s ±14% -8.73% (p=0.000 n=20+20) IndexByte64M-48 8.42GB/s ±16% 8.42GB/s ±19% ~ (p=0.799 n=20+20) IndexBytePortable32-48 770MB/s ±13% 756MB/s ±11% ~ (p=0.383 n=20+20) IndexBytePortable4K-48 1.34GB/s ±14% 1.26GB/s ±10% -5.76% (p=0.018 n=20+20) IndexBytePortable4M-48 1.35GB/s ±15% 1.31GB/s ±11% ~ (p=0.157 n=20+20) IndexBytePortable64M-48 1.25GB/s ±16% 1.22GB/s ±13% ~ (p=0.640 n=20+20) Index32-48 138MB/s ± 8% 687MB/s ± 8% +398.57% (p=0.000 n=19+20) Index4K-48 94.9MB/s ± 9% 1280.5MB/s ±11% +1249.11% (p=0.000 n=19+20) Index4M-48 94.6MB/s ± 7% 1278.5MB/s ±12% +1250.99% (p=0.000 n=19+20) Index64M-48 94.2MB/s ±10% 1210.9MB/s ± 8% +1185.04% (p=0.000 n=19+19) IndexEasy32-48 608MB/s ±10% 1035MB/s ±10% +70.15% (p=0.000 n=20+20) IndexEasy4K-48 29.3GB/s ± 6% 2.6GB/s ± 6% -91.24% (p=0.000 n=19+19) IndexEasy4M-48 23.3GB/s ±10% 2.5GB/s ± 9% -89.23% (p=0.000 n=20+20) IndexEasy64M-48 7.86GB/s ±11% 2.42GB/s ±14% -69.18% (p=0.000 n=19+20) Change-Id: Ia191f0a6ca80e113397d9ed98d25f195768b65bc Reviewed-on: https://go-review.googlesource.com/22550 Run-TryBot: Ilya Tocar TryBot-Result: Gobot Gobot Reviewed-by: Keith Randall --- src/bytes/bytes.go | 31 ----------------- src/bytes/bytes_amd64.go | 69 ++++++++++++++++++++++++++++++++++++++ src/bytes/bytes_generic.go | 41 ++++++++++++++++++++++ src/runtime/asm_amd64.s | 26 +++++++++++--- 4 files changed, 132 insertions(+), 35 deletions(-) create mode 100644 src/bytes/bytes_amd64.go create mode 100644 src/bytes/bytes_generic.go diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go index 305c85d9f4..c35a1c0005 100644 --- a/src/bytes/bytes.go +++ b/src/bytes/bytes.go @@ -93,37 +93,6 @@ func ContainsRune(b []byte, r rune) bool { return IndexRune(b, r) >= 0 } -// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s. -func Index(s, sep []byte) int { - n := len(sep) - if n == 0 { - return 0 - } - if n > len(s) { - return -1 - } - c := sep[0] - if n == 1 { - return IndexByte(s, c) - } - i := 0 - t := s[:len(s)-n+1] - for i < len(t) { - if t[i] != c { - o := IndexByte(t[i:], c) - if o < 0 { - break - } - i += o - } - if Equal(s[i:i+n], sep) { - return i - } - i++ - } - return -1 -} - func indexBytePortable(s []byte, c byte) int { for i, b := range s { if b == c { diff --git a/src/bytes/bytes_amd64.go b/src/bytes/bytes_amd64.go new file mode 100644 index 0000000000..e8be28b51d --- /dev/null +++ b/src/bytes/bytes_amd64.go @@ -0,0 +1,69 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package bytes + +// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s. +// indexShortStr requires 2 <= len(c) <= shortStringLen +func indexShortStr(s, c []byte) int // ../runtime/asm_$GOARCH.s +const shortStringLen = 31 + +// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s. +func Index(s, sep []byte) int { + n := len(sep) + switch { + case n == 0: + return 0 + case n == 1: + return IndexByte(s, sep[0]) + case n <= shortStringLen: + return indexShortStr(s, sep) + case n == len(s): + if Equal(sep, s) { + return 0 + } + return -1 + case n > len(s): + return -1 + } + // Rabin-Karp search + hashsep, pow := hashStr(sep) + var h uint32 + for i := 0; i < n; i++ { + h = h*primeRK + uint32(s[i]) + } + if h == hashsep && Equal(s[:n], sep) { + return 0 + } + for i := n; i < len(s); { + h *= primeRK + h += uint32(s[i]) + h -= pow * uint32(s[i-n]) + i++ + if h == hashsep && Equal(s[i-n:i], sep) { + return i - n + } + } + return -1 +} + +// primeRK is the prime base used in Rabin-Karp algorithm. +const primeRK = 16777619 + +// hashStr returns the hash and the appropriate multiplicative +// factor for use in Rabin-Karp algorithm. +func hashStr(sep []byte) (uint32, uint32) { + hash := uint32(0) + for i := 0; i < len(sep); i++ { + hash = hash*primeRK + uint32(sep[i]) + } + var pow, sq uint32 = 1, primeRK + for i := len(sep); i > 0; i >>= 1 { + if i&1 != 0 { + pow *= sq + } + sq *= sq + } + return hash, pow +} diff --git a/src/bytes/bytes_generic.go b/src/bytes/bytes_generic.go new file mode 100644 index 0000000000..88e232eccf --- /dev/null +++ b/src/bytes/bytes_generic.go @@ -0,0 +1,41 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64 + +package bytes + +// TODO: implements short string optimization on non amd64 platforms +// and get rid of bytes_amd64.go + +// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s. +func Index(s, sep []byte) int { + n := len(sep) + if n == 0 { + return 0 + } + if n > len(s) { + return -1 + } + c := sep[0] + if n == 1 { + return IndexByte(s, c) + } + i := 0 + t := s[:len(s)-n+1] + for i < len(t) { + if t[i] != c { + o := IndexByte(t[i:], c) + if o < 0 { + break + } + i += o + } + if Equal(s[i:i+n], sep) { + return i + } + i++ + } + return -1 +} diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index f44fc1166a..c9d6b90d80 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -1695,13 +1695,31 @@ big_loop_avx2_exit: JMP loop -// TODO: Also use this in bytes.Index TEXT strings·indexShortStr(SB),NOSPLIT,$0-40 MOVQ s+0(FP), DI // We want len in DX and AX, because PCMPESTRI implicitly consumes them MOVQ s_len+8(FP), DX MOVQ c+16(FP), BP MOVQ c_len+24(FP), AX + MOVQ DI, R10 + LEAQ ret+32(FP), R11 + JMP runtime·indexShortStr(SB) + +TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56 + MOVQ s+0(FP), DI + MOVQ s_len+8(FP), DX + MOVQ c+24(FP), BP + MOVQ c_len+32(FP), AX + MOVQ DI, R10 + LEAQ ret+48(FP), R11 + JMP runtime·indexShortStr(SB) + +// AX: length of string, that we are searching for +// DX: length of string, in which we are searching +// DI: pointer to string, in which we are searching +// BP: pointer to string, that we are searching for +// R11: address, where to put return value +TEXT runtime·indexShortStr(SB),NOSPLIT,$0 CMPQ AX, DX JA fail CMPQ DX, $16 @@ -1853,7 +1871,7 @@ partial_success17to31: CMPQ DI,DX JB loop17to31 fail: - MOVQ $-1, ret+32(FP) + MOVQ $-1, (R11) RET sse42: MOVL runtime·cpuid_ecx(SB), CX @@ -1893,8 +1911,8 @@ loop_sse42: sse42_success: ADDQ CX, DI success: - SUBQ s+0(FP), DI - MOVQ DI, ret+32(FP) + SUBQ R10, DI + MOVQ DI, (R11) RET -- 2.48.1