return genSplit(s, sep, len(sep), -1)
}
+var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1} // byte-indexed lookup table: 1 for these six ASCII white space bytes, 0 for every other byte value
+
// Fields splits the string s around each instance of one or more consecutive white space
// characters, as defined by unicode.IsSpace, returning an array of substrings of s or an
// empty list if s contains only white space.
func Fields(s string) []string {
- return FieldsFunc(s, unicode.IsSpace)
+ // First count the fields.
+ // This is an exact count if s is ASCII, otherwise it is an approximation.
+ n := 0 // number of field starts seen so far
+ wasSpace := 1 // 1 if the previous byte was ASCII space; starts at 1 so a leading non-space byte opens a field
+ // setBits is used to track which bits are set in the bytes of s.
+ setBits := uint8(0)
+ for i := 0; i < len(s); i++ {
+ r := s[i]
+ setBits |= r
+ isSpace := int(asciiSpace[r]) // 1 for ASCII white space, 0 for anything else (including bytes >= utf8.RuneSelf)
+ n += wasSpace & ^isSpace // adds 1 only when wasSpace is 1 and isSpace is 0, i.e. at the first byte of a field
+ wasSpace = isSpace
+ }
+
+ if setBits < utf8.RuneSelf { // ASCII fast path: no byte of s has its high bit set
+ a := make([]string, n) // n is exact here, so the result is filled by index rather than appended
+ na := 0 // next free index in a
+ fieldStart := 0 // byte offset at which the current field begins
+ i := 0
+ // Skip spaces in the front of the input.
+ for i < len(s) && asciiSpace[s[i]] != 0 {
+ i++
+ }
+ fieldStart = i
+ for i < len(s) {
+ if asciiSpace[s[i]] == 0 { // non-space byte: still inside the current field
+ i++
+ continue
+ }
+ a[na] = s[fieldStart:i] // s[i] is a space, so the field ends just before it
+ na++
+ i++
+ // Skip spaces in between fields.
+ for i < len(s) && asciiSpace[s[i]] != 0 {
+ i++
+ }
+ fieldStart = i
+ }
+ if fieldStart < len(s) { // Last field might end at EOF.
+ a[na] = s[fieldStart:]
+ }
+ return a
+ }
+
+ // Some runes in the input string are not ASCII.
+ // Same general approach as in the ASCII path but
+ // uses DecodeRuneInString and unicode.IsSpace if
+ // a non-ASCII rune needs to be decoded and checked
+ // if it corresponds to a space.
+ a := make([]string, 0, n) // n is only a capacity hint on this path, so fields are appended
+ fieldStart := 0 // byte offset at which the current field begins
+ i := 0
+ // Skip spaces in the front of the input.
+ for i < len(s) {
+ if c := s[i]; c < utf8.RuneSelf { // single-byte (ASCII) case: consult the lookup table
+ if asciiSpace[c] == 0 {
+ break
+ }
+ i++
+ } else {
+ r, w := utf8.DecodeRuneInString(s[i:]) // r is the rune starting at s[i], w its width in bytes
+ if !unicode.IsSpace(r) {
+ break
+ }
+ i += w
+ }
+ }
+ fieldStart = i
+ for i < len(s) {
+ if c := s[i]; c < utf8.RuneSelf {
+ if asciiSpace[c] == 0 { // ASCII non-space: still inside the current field
+ i++
+ continue
+ }
+ a = append(a, s[fieldStart:i]) // ASCII space ends the field
+ i++
+ } else {
+ r, w := utf8.DecodeRuneInString(s[i:]) // r is the rune starting at s[i], w its width in bytes
+ if !unicode.IsSpace(r) { // non-space rune: advance past all of its bytes
+ i += w
+ continue
+ }
+ a = append(a, s[fieldStart:i]) // multi-byte space rune ends the field
+ i += w
+ }
+ // Skip spaces in between fields (same loop as the leading-space skip above).
+ for i < len(s) {
+ if c := s[i]; c < utf8.RuneSelf {
+ if asciiSpace[c] == 0 {
+ break
+ }
+ i++
+ } else {
+ r, w := utf8.DecodeRuneInString(s[i:]) // r is the rune starting at s[i], w its width in bytes
+ if !unicode.IsSpace(r) {
+ break
+ }
+ i += w
+ }
+ }
+ fieldStart = i
+ }
+ if fieldStart < len(s) { // Last field might end at EOF.
+ a = append(a, s[fieldStart:])
+ }
+ return a
}
// FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c)
{"", []string{}},
{" ", []string{}},
{" \t ", []string{}},
+ {"\u2000", []string{}},
{" abc ", []string{"abc"}},
{"1 2 3 4", []string{"1", "2", "3", "4"}},
{"1 2 3 4", []string{"1", "2", "3", "4"}},
{"1\u20002\u20013\u20024", []string{"1", "2", "3", "4"}},
{"\u2000\u2001\u2002", []string{}},
{"\n™\t™\n", []string{"™", "™"}},
+ {"\n\u20001™2\u2000 \u2001 ™", []string{"1™2", "™"}},
+ {"\n1\uFFFD \uFFFD2\u20003\uFFFD4", []string{"1\uFFFD", "\uFFFD2", "3\uFFFD4"}},
+ {"1\xFF\u2000\xFF2\xFF \xFF", []string{"1\xFF", "\xFF2\xFF", "\xFF"}},
{faces, []string{faces}},
}
return string(x)
}
-var fieldsInput = makeFieldsInput()
+var makeFieldsInputASCII = func() string { // builds a 1<<20-byte ASCII-only benchmark input; NOTE(review): a plain func declaration would be more conventional than a func-valued var
+ x := make([]byte, 1<<20)
+ // Input is ~10% space, rest ASCII non-space.
+ for i := range x {
+ if rand.Intn(10) == 0 { // one outcome out of ten selects a space
+ x[i] = ' '
+ } else {
+ x[i] = 'x'
+ }
+ }
+ return string(x)
+}
+
+var stringdata = []struct{ name, data string }{ // benchmark inputs; name is used as the sub-benchmark name
+ {"ASCII", makeFieldsInputASCII()}, // ASCII-only input: spaces and 'x' bytes
+ {"Mixed", makeFieldsInput()}, // presumably contains non-ASCII data; makeFieldsInput is defined outside this view, so confirm
+}
func BenchmarkFields(b *testing.B) {
- b.SetBytes(int64(len(fieldsInput)))
- for i := 0; i < b.N; i++ {
- Fields(fieldsInput)
+ for _, sd := range stringdata { // one sub-benchmark group per input kind
+ b.Run(sd.name, func(b *testing.B) {
+ for j := 1 << 4; j <= 1<<20; j <<= 4 { // input lengths 16, 256, 4096, 65536, 1048576
+ b.Run(fmt.Sprintf("%d", j), func(b *testing.B) {
+ b.ReportAllocs()
+ b.SetBytes(int64(j)) // bytes processed per iteration, so throughput is reported
+ data := sd.data[:j] // first j bytes only; assumes len(sd.data) >= 1<<20 (true for the ASCII input, confirm for makeFieldsInput)
+ for i := 0; i < b.N; i++ {
+ Fields(data)
+ }
+ })
+ }
+ })
}
}
func BenchmarkFieldsFunc(b *testing.B) {
- b.SetBytes(int64(len(fieldsInput)))
- for i := 0; i < b.N; i++ {
- FieldsFunc(fieldsInput, unicode.IsSpace)
+ for _, sd := range stringdata { // one sub-benchmark group per input kind
+ b.Run(sd.name, func(b *testing.B) {
+ for j := 1 << 4; j <= 1<<20; j <<= 4 { // input lengths 16, 256, 4096, 65536, 1048576
+ b.Run(fmt.Sprintf("%d", j), func(b *testing.B) {
+ b.ReportAllocs()
+ b.SetBytes(int64(j)) // bytes processed per iteration, so throughput is reported
+ data := sd.data[:j] // first j bytes only; assumes len(sd.data) >= 1<<20 (true for the ASCII input, confirm for makeFieldsInput)
+ for i := 0; i < b.N; i++ {
+ FieldsFunc(data, unicode.IsSpace) // same inputs as BenchmarkFields, but through the generic predicate-based splitter
+ }
+ })
+ }
+ })
}
}