bytes, strings: speed up TrimSpace 4-5x for common ASCII cases

author Ben Hoyt <benhoyt@gmail.com>

Thu, 6 Dec 2018 13:53:29 +0000 (08:53 -0500)

committer Brad Fitzpatrick <bradfitz@golang.org>

Tue, 12 Mar 2019 15:52:17 +0000 (15:52 +0000)
author Ben Hoyt <benhoyt@gmail.com>
Thu, 6 Dec 2018 13:53:29 +0000 (08:53 -0500)
committer Brad Fitzpatrick <bradfitz@golang.org>
Tue, 12 Mar 2019 15:52:17 +0000 (15:52 +0000)
diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go

index 6fcebe65930061113b96e50b2ff98f9d9045ea99..08fc14d837fb6ab6168a98a3d927e9fc3675d5c7 100644 (file)
--- a/src/bytes/bytes.go
+++ b/src/bytes/bytes.go
@@ -759,7 +759,36 @@ func TrimRight(s []byte, cutset string) []byte {
  // TrimSpace returns a subslice of s by slicing off all leading and
  // trailing white space, as defined by Unicode.
  func TrimSpace(s []byte) []byte {
-       return TrimFunc(s, unicode.IsSpace)
+       // Fast path for ASCII: look for the first ASCII non-space byte
+       start := 0
+       for ; start < len(s); start++ {
+               c := s[start]
+               if c >= utf8.RuneSelf {
+                       // If we run into a non-ASCII byte, fall back to the
+                       // slower Unicode-aware method on the remaining bytes
+                       return TrimFunc(s[start:], unicode.IsSpace)
+               }
+               if asciiSpace[c] == 0 {
+                       break
+               }
+       }
+
+       // Now look for the first ASCII non-space byte from the end
+       stop := len(s)
+       for ; stop > start; stop-- {
+               c := s[stop-1]
+               if c >= utf8.RuneSelf {
+                       return TrimFunc(s[start:stop], unicode.IsSpace)
+               }
+               if asciiSpace[c] == 0 {
+                       break
+               }
+       }
+
+       // At this point s[start:stop] is either empty or starts and ends
+       // with an ASCII non-space byte, so we're done. Non-ASCII cases
+       // have already been handled above.
+       return s[start:stop]
  }
  
  // Runes interprets s as a sequence of UTF-8-encoded code points.
diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go

index 80a54f6118b199046382c7f2540fd37b746956e0..98ba95009d40ea8bd64529adca377d1c6649ddfe 100644 (file)
--- a/src/bytes/bytes_test.go
+++ b/src/bytes/bytes_test.go
@@ -1617,9 +1617,21 @@ func BenchmarkFieldsFunc(b *testing.B) {
  }
  
  func BenchmarkTrimSpace(b *testing.B) {
-       s := []byte("  Some text.  \n")
-       for i := 0; i < b.N; i++ {
-               TrimSpace(s)
+       tests := []struct {
+               name  string
+               input []byte
+       }{
+               {"NoTrim", []byte("typical")},
+               {"ASCII", []byte("  foo bar  ")},
+               {"SomeNonASCII", []byte("    \u2000\t\r\n x\t\t\r\r\ny\n \u3000    ")},
+               {"JustNonASCII", []byte("\u2000\u2000\u2000☺☺☺☺\u3000\u3000\u3000")},
+       }
+       for _, test := range tests {
+               b.Run(test.name, func(b *testing.B) {
+                       for i := 0; i < b.N; i++ {
+                               TrimSpace(test.input)
+                       }
+               })
         }
  }
  
diff --git a/src/strings/strings.go b/src/strings/strings.go

index a98f5d8ff13946380086722aa9b45939b0498fcd..e14fffb2b86bcd6d9052f6153c685b59f7aaee5b 100644 (file)
--- a/src/strings/strings.go
+++ b/src/strings/strings.go
@@ -818,7 +818,36 @@ func TrimRight(s string, cutset string) string {
  // TrimSpace returns a slice of the string s, with all leading
  // and trailing white space removed, as defined by Unicode.
  func TrimSpace(s string) string {
-       return TrimFunc(s, unicode.IsSpace)
+       // Fast path for ASCII: look for the first ASCII non-space byte
+       start := 0
+       for ; start < len(s); start++ {
+               c := s[start]
+               if c >= utf8.RuneSelf {
+                       // If we run into a non-ASCII byte, fall back to the
+                       // slower Unicode-aware method on the remaining bytes
+                       return TrimFunc(s[start:], unicode.IsSpace)
+               }
+               if asciiSpace[c] == 0 {
+                       break
+               }
+       }
+
+       // Now look for the first ASCII non-space byte from the end
+       stop := len(s)
+       for ; stop > start; stop-- {
+               c := s[stop-1]
+               if c >= utf8.RuneSelf {
+                       return TrimFunc(s[start:stop], unicode.IsSpace)
+               }
+               if asciiSpace[c] == 0 {
+                       break
+               }
+       }
+
+       // At this point s[start:stop] is either empty or starts and ends
+       // with an ASCII non-space byte, so we're done. Non-ASCII cases
+       // have already been handled above.
+       return s[start:stop]
  }
  
  // TrimPrefix returns s without the provided leading prefix string.
diff --git a/src/strings/strings_test.go b/src/strings/strings_test.go

index eee2dd55dfd09eb886b8672cc3cc0ee6b1318462..500671aca474e5da1da9b6139456086dc3d7c80a 100644 (file)
--- a/src/strings/strings_test.go
+++ b/src/strings/strings_test.go
@@ -1731,3 +1731,19 @@ func BenchmarkJoin(b *testing.B) {
                 })
         }
  }
+
+func BenchmarkTrimSpace(b *testing.B) {
+       tests := []struct{ name, input string }{
+               {"NoTrim", "typical"},
+               {"ASCII", "  foo bar  "},
+               {"SomeNonASCII", "    \u2000\t\r\n x\t\t\r\r\ny\n \u3000    "},
+               {"JustNonASCII", "\u2000\u2000\u2000☺☺☺☺\u3000\u3000\u3000"},
+       }
+       for _, test := range tests {
+               b.Run(test.name, func(b *testing.B) {
+                       for i := 0; i < b.N; i++ {
+                               TrimSpace(test.input)
+                       }
+               })
+       }
+}
author	Ben Hoyt <benhoyt@gmail.com>
	Thu, 6 Dec 2018 13:53:29 +0000 (08:53 -0500)
committer	Brad Fitzpatrick <bradfitz@golang.org>
	Tue, 12 Mar 2019 15:52:17 +0000 (15:52 +0000)
src/bytes/bytes.go		patch \| blob \| history
src/bytes/bytes_test.go		patch \| blob \| history
src/strings/strings.go		patch \| blob \| history
src/strings/strings_test.go		patch \| blob \| history