// If r is [utf8.RuneError], it returns the first instance of any
// invalid UTF-8 byte sequence.
func IndexRune(s []byte, r rune) int {
+ const haveFastIndex = bytealg.MaxBruteForce > 0
switch {
case 0 <= r && r < utf8.RuneSelf:
return IndexByte(s, byte(r))
case !utf8.ValidRune(r):
return -1
default:
+ // Search for rune r using the last byte of its UTF-8 encoded form.
+ // The distribution of the last byte is more uniform compared to the
+ // first byte which has a 78% chance of being [240, 243, 244].
var b [utf8.UTFMax]byte
n := utf8.EncodeRune(b[:], r)
- return Index(s, b[:n])
+ last := n - 1
+ i := last
+ fails := 0
+ for i < len(s) {
+ if s[i] != b[last] {
+ o := IndexByte(s[i+1:], b[last])
+ if o < 0 {
+ return -1
+ }
+ i += o + 1
+ }
+ // Step backwards comparing bytes.
+ for j := 1; j < n; j++ {
+ if s[i-j] != b[last-j] {
+ goto next
+ }
+ }
+ return i - last
+ next:
+ fails++
+ i++
+ if (haveFastIndex && fails > bytealg.Cutover(i)) && i < len(s) ||
+ (!haveFastIndex && fails >= 4+i>>4 && i < len(s)) {
+ goto fallback
+ }
+ }
+ return -1
+
+ fallback:
+ // Switch to bytealg.Index, if available, or a brute-force search when
+ // IndexByte returns too many false positives.
+ if haveFastIndex {
+ if j := bytealg.Index(s[i-last:], b[:n]); j >= 0 {
+ return i + j - last
+ }
+ } else {
+ // If bytealg.Index is not available a brute force search is
+ // ~1.5-3x faster than Rabin-Karp since n is small.
+ c0 := b[last]
+ c1 := b[last-1] // There are at least 2 chars to match
+ loop:
+ for ; i < len(s); i++ {
+ if s[i] == c0 && s[i-1] == c1 {
+ for k := 2; k < n; k++ {
+ if s[i-k] != b[last-k] {
+ continue loop
+ }
+ }
+ return i - last
+ }
+ }
+ }
+ return -1
}
}
{"oxoxoxoxoxoxoxoxoxoxoxox", "oy", -1},
// test fallback to Rabin-Karp.
{"000000000000000000000000000000000000000000000000000000000000000000000001", "0000000000000000000000000000000000000000000000000000000000000000001", 5},
+ // test fallback to IndexRune
+ {"oxoxoxoxoxoxoxoxoxoxox☺", "☺", 22},
+ // invalid UTF-8 byte sequence (must be longer than bytealg.MaxBruteForce to
+ // test that we don't use IndexRune)
+ {"xx0123456789012345678901234567890123456789012345678901234567890120123456789012345678901234567890123456xxx\xed\x9f\xc0", "\xed\x9f\xc0", 105},
}
var lastIndexTests = []BinOpTest{
{"some_text=some_value", '=', 9},
{"☺a", 'a', 3},
{"a☻☺b", '☺', 4},
+ {"𠀳𠀗𠀾𠁄𠀧𠁆𠁂𠀫𠀖𠀪𠀲𠀴𠁀𠀨𠀿", '𠀿', 56},
+
+ // 2 bytes
+ {"ӆ", 'ӆ', 0},
+ {"a", 'ӆ', -1},
+ {" ӆ", 'ӆ', 2},
+ {" a", 'ӆ', -1},
+ {strings.Repeat("ц", 64) + "ӆ", 'ӆ', 128}, // test cutover
+ {strings.Repeat("ц", 64), 'ӆ', -1},
+
+ // 3 bytes
+ {"Ꚁ", 'Ꚁ', 0},
+ {"a", 'Ꚁ', -1},
+ {" Ꚁ", 'Ꚁ', 2},
+ {" a", 'Ꚁ', -1},
+ {strings.Repeat("Ꙁ", 64) + "Ꚁ", 'Ꚁ', 192}, // test cutover
+ {strings.Repeat("Ꙁ", 64) + "Ꚁ", '䚀', -1}, // 'Ꚁ' and '䚀' share the same last two bytes
+
+ // 4 bytes
+ {"𡌀", '𡌀', 0},
+ {"a", '𡌀', -1},
+ {" 𡌀", '𡌀', 2},
+ {" a", '𡌀', -1},
+ {strings.Repeat("𡋀", 64) + "𡌀", '𡌀', 256}, // test cutover
+ {strings.Repeat("𡋀", 64) + "𡌀", '𣌀', -1}, // '𡌀' and '𣌀' share the same last two bytes
// RuneError should match any invalid UTF-8 byte sequence.
{"�", '�', 0},
{"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", -1, -1},
{"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", 0xD800, -1}, // Surrogate pair
{"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", utf8.MaxRune + 1, -1},
+
+ // Test the cutover to bytealg.Index when it is triggered in
+ // the middle of a rune that contains consecutive runs of equal bytes.
+ {"aaaaaKKKK\U000bc104", '\U000bc104', 17}, // cutover: (n + 16) / 8
+ {"aaaaaKKKK鄄", '鄄', 17},
+ {"aaKKKKKa\U000bc104", '\U000bc104', 18}, // cutover: 4 + n>>4
+ {"aaKKKKKa鄄", '鄄', 18},
}
for _, tt := range tests {
if got := IndexRune([]byte(tt.in), tt.rune); got != tt.want {
benchBytes(b, indexSizes, bmIndexRuneASCII(IndexRune))
}
+func BenchmarkIndexRuneUnicode(b *testing.B) { // times IndexRune for 2-, 3- and 4-byte needles; see bmIndexRuneUnicode for the haystack layout
+ b.Run("Latin", func(b *testing.B) {
+ // Latin is mostly 1, 2, 3 byte runes; the needle 'é' encodes to 2 bytes.
+ benchBytes(b, indexSizes, bmIndexRuneUnicode(unicode.Latin, 'é'))
+ })
+ b.Run("Cyrillic", func(b *testing.B) {
+ // Cyrillic is mostly 2 and 3 byte runes; the needle 'Ꙁ' encodes to 3 bytes.
+ benchBytes(b, indexSizes, bmIndexRuneUnicode(unicode.Cyrillic, 'Ꙁ'))
+ })
+ b.Run("Han", func(b *testing.B) {
+ // Han consists only of 3 and 4 byte runes; the needle '𠀿' encodes to 4 bytes.
+ benchBytes(b, indexSizes, bmIndexRuneUnicode(unicode.Han, '𠀿'))
+ })
+}
+
func bmIndexRuneASCII(index func([]byte, rune) int) func(b *testing.B, n int) {
return func(b *testing.B, n int) {
buf := bmbuf[0:n]
}
}
+func bmIndexRuneUnicode(rt *unicode.RangeTable, needle rune) func(b *testing.B, n int) { // returns a benchmark body that searches an n-byte buffer of runes from rt for needle, which is placed at the very end
+ var rs []rune // every rune listed in rt.R16 and rt.R32 except needle
+ for _, r16 := range rt.R16 {
+ for r := rune(r16.Lo); r <= rune(r16.Hi); r += rune(r16.Stride) {
+ if r != needle {
+ rs = append(rs, rune(r))
+ }
+ }
+ }
+ for _, r32 := range rt.R32 {
+ for r := rune(r32.Lo); r <= rune(r32.Hi); r += rune(r32.Stride) {
+ if r != needle {
+ rs = append(rs, rune(r))
+ }
+ }
+ }
+ // Shuffle the runes so that they are not in the increasing order produced
+ // by walking each range from Lo to Hi above. The shuffle uses a fixed seed,
+ // so it is deterministic: benchmarks need to be repeatable.
+ rr := rand.New(rand.NewSource(1))
+ rr.Shuffle(len(rs), func(i, j int) {
+ rs[i], rs[j] = rs[j], rs[i]
+ })
+ uchars := string(rs) // UTF-8 encoding of the shuffled runes
+
+ return func(b *testing.B, n int) {
+ buf := bmbuf[0:n]
+ o := copy(buf, uchars)
+ for o < len(buf) { // tile uchars until buf is full; the final copy may stop in the middle of a rune
+ o += copy(buf[o:], uchars)
+ }
+
+ // Make space for the needle rune at the end of buf: drop trailing runes (as decoded by utf8.DecodeLastRune), zeroing their bytes, until at least m bytes are free.
+ m := utf8.RuneLen(needle)
+ for o := m; o > 0; { // this o shadows the outer o and counts the bytes still to be freed
+ _, sz := utf8.DecodeLastRune(buf)
+ copy(buf[len(buf)-sz:], "\x00\x00\x00\x00") // zero the dropped bytes; the copy is bounded by the sz-byte destination
+ buf = buf[:len(buf)-sz]
+ o -= sz
+ }
+ buf = utf8.AppendRune(buf[:n-m], needle) // re-extend buf to n-m bytes and append needle, so needle starts at offset n-m
+
+ n -= m // n is now the index at which IndexRune is expected to find needle
+ for i := 0; i < b.N; i++ {
+ j := IndexRune(buf, needle)
+ if j != n {
+ b.Fatal("bad index", j)
+ }
+ }
+ for i := range buf { // zero the bytes written above; buf aliases bmbuf, presumably shared with other benchmarks (confirm)
+ buf[i] = '\x00'
+ }
+ }
+}
+
func BenchmarkEqual(b *testing.B) {
b.Run("0", func(b *testing.B) {
var buf [4]byte
var benchInputHard = makeBenchInputHard()
func benchmarkIndexHard(b *testing.B, sep []byte) {
+ n := Index(benchInputHard, sep)
+ if n < 0 {
+ n = len(benchInputHard)
+ }
+ b.SetBytes(int64(n))
for i := 0; i < b.N; i++ {
Index(benchInputHard, sep)
}
// If r is [utf8.RuneError], it returns the first instance of any
// invalid UTF-8 byte sequence.
func IndexRune(s string, r rune) int {
+ const haveFastIndex = bytealg.MaxBruteForce > 0
switch {
case 0 <= r && r < utf8.RuneSelf:
return IndexByte(s, byte(r))
case !utf8.ValidRune(r):
return -1
default:
- return Index(s, string(r))
+ // Search for rune r using the last byte of its UTF-8 encoded form.
+ // The distribution of the last byte is more uniform compared to the
+ // first byte which has a 78% chance of being [240, 243, 244].
+ rs := string(r)
+ last := len(rs) - 1
+ i := last
+ fails := 0
+ for i < len(s) {
+ if s[i] != rs[last] {
+ o := IndexByte(s[i+1:], rs[last])
+ if o < 0 {
+ return -1
+ }
+ i += o + 1
+ }
+ // Step backwards comparing bytes.
+ for j := 1; j < len(rs); j++ {
+ if s[i-j] != rs[last-j] {
+ goto next
+ }
+ }
+ return i - last
+ next:
+ fails++
+ i++
+ if (haveFastIndex && fails > bytealg.Cutover(i)) && i < len(s) ||
+ (!haveFastIndex && fails >= 4+i>>4 && i < len(s)) {
+ goto fallback
+ }
+ }
+ return -1
+
+ fallback:
+ // see comment in ../bytes/bytes.go
+ if haveFastIndex {
+ if j := bytealg.IndexString(s[i-last:], string(r)); j >= 0 {
+ return i + j - last
+ }
+ } else {
+ c0 := rs[last]
+ c1 := rs[last-1]
+ loop:
+ for ; i < len(s); i++ {
+ if s[i] == c0 && s[i-1] == c1 {
+ for k := 2; k < len(rs); k++ {
+ if s[i-k] != rs[last-k] {
+ continue loop
+ }
+ }
+ return i - last
+ }
+ }
+ }
+ return -1
}
}
// test fallback to Rabin-Karp.
{"oxoxoxoxoxoxoxoxoxoxoxoy", "oy", 22},
{"oxoxoxoxoxoxoxoxoxoxoxox", "oy", -1},
+ // test fallback to IndexRune
+ {"oxoxoxoxoxoxoxoxoxoxox☺", "☺", 22},
+ // invalid UTF-8 byte sequence (must be longer than bytealg.MaxBruteForce to
+ // test that we don't use IndexRune)
+ {"xx0123456789012345678901234567890123456789012345678901234567890120123456789012345678901234567890123456xxx\xed\x9f\xc0", "\xed\x9f\xc0", 105},
}
var lastIndexTests = []IndexTest{
{"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", -1, -1},
{"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", 0xD800, -1}, // Surrogate pair
{"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", utf8.MaxRune + 1, -1},
+
+ // 2 bytes
+ {"ӆ", 'ӆ', 0},
+ {"a", 'ӆ', -1},
+ {" ӆ", 'ӆ', 2},
+ {" a", 'ӆ', -1},
+ {Repeat("ц", 64) + "ӆ", 'ӆ', 128}, // test cutover
+ {Repeat("Ꙁ", 64) + "Ꚁ", '䚀', -1}, // 'Ꚁ' and '䚀' share the same last two bytes
+
+ // 3 bytes
+ {"Ꚁ", 'Ꚁ', 0},
+ {"a", 'Ꚁ', -1},
+ {" Ꚁ", 'Ꚁ', 2},
+ {" a", 'Ꚁ', -1},
+ {Repeat("Ꙁ", 64) + "Ꚁ", 'Ꚁ', 192}, // test cutover
+ {Repeat("𡋀", 64) + "𡌀", '𣌀', -1}, // '𡌀' and '𣌀' share the same last two bytes
+
+ // 4 bytes
+ {"𡌀", '𡌀', 0},
+ {"a", '𡌀', -1},
+ {" 𡌀", '𡌀', 2},
+ {" a", '𡌀', -1},
+ {Repeat("𡋀", 64) + "𡌀", '𡌀', 256}, // test cutover
+ {Repeat("𡋀", 64), '𡌀', -1},
+
+ // Test the cutover to bytealg.IndexString when it is triggered in
+ // the middle of a rune that contains consecutive runs of equal bytes.
+ {"aaaaaKKKK\U000bc104", '\U000bc104', 17}, // cutover: (n + 16) / 8
+ {"aaaaaKKKK鄄", '鄄', 17},
+ {"aaKKKKKa\U000bc104", '\U000bc104', 18}, // cutover: 4 + n>>4
+ {"aaKKKKKa鄄", '鄄', 18},
}
for _, tt := range tests {
if got := IndexRune(tt.in, tt.rune); got != tt.want {
}
}
- haystack := "test世界"
+ // Make sure we trigger the cutover and string(rune) conversion.
+ haystack := "test" + Repeat("𡋀", 32) + "𡌀"
allocs := testing.AllocsPerRun(1000, func() {
if i := IndexRune(haystack, 's'); i != 2 {
t.Fatalf("'s' at %d; want 2", i)
}
- if i := IndexRune(haystack, '世'); i != 4 {
- t.Fatalf("'世' at %d; want 4", i)
+ // "test" is 4 bytes and each of the 32 '𡋀' runes is 4 bytes, so '𡌀'
+ // starts at byte offset 4 + 32*4 = 132.
+ if i := IndexRune(haystack, '𡌀'); i != 132 {
+ t.Fatalf("'𡌀' at %d; want 132", i)
+ }
})
if allocs != 0 && testing.CoverMode() == "" {