strings: speed up Fields

author Martin Möhrmann <moehrmann@google.com>

Mon, 6 Mar 2017 08:34:39 +0000 (09:34 +0100)

committer Martin Möhrmann <moehrmann@google.com>

Tue, 4 Apr 2017 06:26:11 +0000 (06:26 +0000)
author Martin Möhrmann <moehrmann@google.com>
Mon, 6 Mar 2017 08:34:39 +0000 (09:34 +0100)
committer Martin Möhrmann <moehrmann@google.com>
Tue, 4 Apr 2017 06:26:11 +0000 (06:26 +0000)
diff --git a/src/strings/strings.go b/src/strings/strings.go

index 2650fb057ca5977bb38ef8e6e6c85c566fadb522..a01eb698c4393351aed03cb151cf926836d99330 100644 (file)
--- a/src/strings/strings.go
+++ b/src/strings/strings.go
@@ -290,11 +290,118 @@ func SplitAfter(s, sep string) []string {
         return genSplit(s, sep, len(sep), -1)
  }
  
+var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1} // byte-indexed table: 1 for the six ASCII white space bytes, 0 for every other byte
+
  // Fields splits the string s around each instance of one or more consecutive white space
  // characters, as defined by unicode.IsSpace, returning a slice of substrings of s or an
  // empty slice if s contains only white space.
  func Fields(s string) []string {
-       return FieldsFunc(s, unicode.IsSpace)
+       // First count the fields, treating only the bytes marked in asciiSpace as separators.
+       // This is an exact count if s is ASCII, otherwise it is an approximation.
+       n := 0
+       wasSpace := 1 // start as if preceded by a space so a leading non-space byte opens a field
+       // setBits is the bitwise OR of all bytes of s; it records which bits occur anywhere in s.
+       setBits := uint8(0)
+       for i := 0; i < len(s); i++ {
+               r := s[i]
+               setBits |= r
+               isSpace := int(asciiSpace[r])
+               n += wasSpace & ^isSpace // adds 1 only on a space -> non-space transition
+               wasSpace = isSpace
+       }
+
+       if setBits < utf8.RuneSelf { // ASCII fast path: no byte of s has its high bit set
+               a := make([]string, n) // n is exact here, so a is filled by index
+               na := 0 // index of the next free slot in a
+               fieldStart := 0
+               i := 0
+               // Skip spaces in the front of the input.
+               for i < len(s) && asciiSpace[s[i]] != 0 {
+                       i++
+               }
+               fieldStart = i
+               for i < len(s) {
+                       if asciiSpace[s[i]] == 0 {
+                               i++
+                               continue
+                       }
+                       a[na] = s[fieldStart:i] // s[i] is a space: the field ends just before it
+                       na++
+                       i++
+                       // Skip spaces in between fields.
+                       for i < len(s) && asciiSpace[s[i]] != 0 {
+                               i++
+                       }
+                       fieldStart = i
+               }
+               if fieldStart < len(s) { // Last field might run to the end of s.
+                       a[na] = s[fieldStart:]
+               }
+               return a
+       }
+
+       // Some runes in the input string are not ASCII.
+       // Same general approach as in the ASCII path, but
+       // bytes >= utf8.RuneSelf are decoded with
+       // utf8.DecodeRuneInString and tested with unicode.IsSpace,
+       // while ASCII bytes still use the asciiSpace table.
+       a := make([]string, 0, n) // n is only a capacity hint here; fields are appended
+       fieldStart := 0
+       i := 0
+       // Skip spaces in the front of the input.
+       for i < len(s) {
+               if c := s[i]; c < utf8.RuneSelf {
+                       if asciiSpace[c] == 0 {
+                               break
+                       }
+                       i++
+               } else {
+                       r, w := utf8.DecodeRuneInString(s[i:]) // w is the number of bytes consumed by r
+                       if !unicode.IsSpace(r) {
+                               break
+                       }
+                       i += w
+               }
+       }
+       fieldStart = i
+       for i < len(s) {
+               if c := s[i]; c < utf8.RuneSelf {
+                       if asciiSpace[c] == 0 {
+                               i++
+                               continue
+                       }
+                       a = append(a, s[fieldStart:i]) // ASCII space at i ends the current field
+                       i++
+               } else {
+                       r, w := utf8.DecodeRuneInString(s[i:])
+                       if !unicode.IsSpace(r) {
+                               i += w
+                               continue
+                       }
+                       a = append(a, s[fieldStart:i]) // non-ASCII space at i ends the current field
+                       i += w
+               }
+               // Skip spaces in between fields.
+               for i < len(s) {
+                       if c := s[i]; c < utf8.RuneSelf {
+                               if asciiSpace[c] == 0 {
+                                       break
+                               }
+                               i++
+                       } else {
+                               r, w := utf8.DecodeRuneInString(s[i:])
+                               if !unicode.IsSpace(r) {
+                                       break
+                               }
+                               i += w
+                       }
+               }
+               fieldStart = i
+       }
+       if fieldStart < len(s) { // Last field might run to the end of s.
+               a = append(a, s[fieldStart:])
+       }
+       return a
  }
  
  // FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c)
diff --git a/src/strings/strings_test.go b/src/strings/strings_test.go

index 97041eb9aca71cd31d192b857c7e4a8668868901..58314a68689140ca8aeb8ddf41ec4f8c5a35e1f4 100644 (file)
--- a/src/strings/strings_test.go
+++ b/src/strings/strings_test.go
@@ -452,6 +452,7 @@ var fieldstests = []FieldsTest{
         {"", []string{}},
         {" ", []string{}},
         {" \t ", []string{}},
+       {"\u2000", []string{}},
         {"  abc  ", []string{"abc"}},
         {"1 2 3 4", []string{"1", "2", "3", "4"}},
         {"1  2  3  4", []string{"1", "2", "3", "4"}},
@@ -459,6 +460,9 @@ var fieldstests = []FieldsTest{
         {"1\u20002\u20013\u20024", []string{"1", "2", "3", "4"}},
         {"\u2000\u2001\u2002", []string{}},
         {"\n™\t™\n", []string{"™", "™"}},
+       {"\n\u20001™2\u2000 \u2001 ™", []string{"1™2", "™"}},
+       {"\n1\uFFFD \uFFFD2\u20003\uFFFD4", []string{"1\uFFFD", "\uFFFD2", "3\uFFFD4"}},
+       {"1\xFF\u2000\xFF2\xFF \xFF", []string{"1\xFF", "\xFF2\xFF", "\xFF"}},
         {faces, []string{faces}},
  }
  
@@ -1473,19 +1477,55 @@ var makeFieldsInput = func() string {
         return string(x)
  }
  
-var fieldsInput = makeFieldsInput()
+var makeFieldsInputASCII = func() string { // builds the pure-ASCII benchmark input
+       x := make([]byte, 1<<20) // 1 MiB buffer
+       // Input is ~10% space, rest ASCII non-space ('x').
+       for i := range x {
+               if rand.Intn(10) == 0 {
+                       x[i] = ' '
+               } else {
+                       x[i] = 'x'
+               }
+       }
+       return string(x)
+}
+
+var stringdata = []struct{ name, data string }{ // named inputs shared by BenchmarkFields and BenchmarkFieldsFunc
+       {"ASCII", makeFieldsInputASCII()}, // spaces and 'x' bytes only
+       {"Mixed", makeFieldsInput()}, // NOTE(review): makeFieldsInput's body is outside this hunk; presumably includes non-ASCII runes - confirm
+}
  
  func BenchmarkFields(b *testing.B) {
-       b.SetBytes(int64(len(fieldsInput)))
-       for i := 0; i < b.N; i++ {
-               Fields(fieldsInput)
+       for _, sd := range stringdata { // one sub-benchmark group per input kind
+               b.Run(sd.name, func(b *testing.B) {
+                       for j := 1 << 4; j <= 1<<20; j <<= 4 { // input sizes 16, 256, 4096, 65536 and 1048576 bytes
+                               b.Run(fmt.Sprintf("%d", j), func(b *testing.B) {
+                                       b.ReportAllocs()
+                                       b.SetBytes(int64(j))
+                                       data := sd.data[:j] // first j bytes; assumes each input is at least 1<<20 bytes long
+                                       for i := 0; i < b.N; i++ {
+                                               Fields(data)
+                                       }
+                               })
+                       }
+               })
         }
  }
  
  func BenchmarkFieldsFunc(b *testing.B) {
-       b.SetBytes(int64(len(fieldsInput)))
-       for i := 0; i < b.N; i++ {
-               FieldsFunc(fieldsInput, unicode.IsSpace)
+       for _, sd := range stringdata { // one sub-benchmark group per input kind
+               b.Run(sd.name, func(b *testing.B) {
+                       for j := 1 << 4; j <= 1<<20; j <<= 4 { // input sizes 16, 256, 4096, 65536 and 1048576 bytes
+                               b.Run(fmt.Sprintf("%d", j), func(b *testing.B) {
+                                       b.ReportAllocs()
+                                       b.SetBytes(int64(j))
+                                       data := sd.data[:j] // first j bytes; assumes each input is at least 1<<20 bytes long
+                                       for i := 0; i < b.N; i++ {
+                                               FieldsFunc(data, unicode.IsSpace) // same inputs as BenchmarkFields, for comparison
+                                       }
+                               })
+                       }
+               })
         }
  }
author	Martin Möhrmann <moehrmann@google.com>
	Mon, 6 Mar 2017 08:34:39 +0000 (09:34 +0100)
committer	Martin Möhrmann <moehrmann@google.com>
	Tue, 4 Apr 2017 06:26:11 +0000 (06:26 +0000)
src/strings/strings.go		patch \| blob \| history
src/strings/strings_test.go		patch \| blob \| history