go/scanner: optimize scanIdentifier

author Rob Findley <rfindley@google.com>

Thu, 1 Apr 2021 16:23:22 +0000 (12:23 -0400)

committer Robert Findley <rfindley@google.com>

Tue, 27 Apr 2021 17:05:58 +0000 (17:05 +0000)
author Rob Findley <rfindley@google.com>
Thu, 1 Apr 2021 16:23:22 +0000 (12:23 -0400)
committer Robert Findley <rfindley@google.com>
Tue, 27 Apr 2021 17:05:58 +0000 (17:05 +0000)
diff --git a/src/go/scanner/scanner.go b/src/go/scanner/scanner.go

index 00fe2dc0b19fd0b66bbe3fb48d27d1b0d27b972f..299c03fc9710f4f700ffb321d8ea0b52a2d13993 100644 (file)
--- a/src/go/scanner/scanner.go
+++ b/src/go/scanner/scanner.go
@@ -48,11 +48,16 @@ type Scanner struct {
         ErrorCount int // number of errors encountered
  }
  
-const bom = 0xFEFF // byte order mark, only permitted as very first character
+const (
+       bom = 0xFEFF // byte order mark, only permitted as very first character
+       eof = -1     // end of file
+)
  
  // Read the next Unicode char into s.ch.
  // s.ch < 0 means end-of-file.
  //
+// For optimization, there is some overlap between this method and
+// s.scanIdentifier.
  func (s *Scanner) next() {
         if s.rdOffset < len(s.src) {
                 s.offset = s.rdOffset
@@ -81,7 +86,7 @@ func (s *Scanner) next() {
                         s.lineOffset = s.offset
                         s.file.AddLine(s.offset)
                 }
-               s.ch = -1 // eof
+               s.ch = eof
         }
  }
  
@@ -347,11 +352,53 @@ func isDigit(ch rune) bool {
         return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
  }
  
+// scanIdentifier reads the string of valid identifier characters at s.offset.
+// It must only be called when s.ch is known to be a valid letter.
+//
+// Be careful when making changes to this function: it is optimized and affects
+// scanning performance significantly.
  func (s *Scanner) scanIdentifier() string {
         offs := s.offset
-       for isLetter(s.ch) || isDigit(s.ch) {
+
+       // Optimize for the common case of an ASCII identifier.
+       //
+       // Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
+       // avoids conversions to runes.
+       //
+       // In case we encounter a non-ASCII character, fall back on the slower path
+       // of calling into s.next().
+       for rdOffset, b := range s.src[s.rdOffset:] {
+               if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
+                       // Avoid assigning a rune for the common case of an ascii character.
+                       continue
+               }
+               s.rdOffset += rdOffset
+               if b < utf8.RuneSelf {
+                       // Optimization: we've encountered an ASCII character that's not a letter
+                       // or number. Avoid the call into s.next() and corresponding set up.
+                       //
+                       // Note that s.next() does some line accounting if s.ch is '\n', so this
+                       // shortcut is only possible because we know that the preceding character
+                       // is not '\n'.
+                       s.ch = rune(b)
+                       s.offset = s.rdOffset
+                       s.rdOffset++
+                       goto exit
+               }
+               // We know that the preceding character is valid for an identifier because
+               // scanIdentifier is only called when s.ch is a letter, so calling s.next()
+               // at s.rdOffset resets the scanner state.
                 s.next()
+               for isLetter(s.ch) || isDigit(s.ch) {
+                       s.next()
+               }
+               goto exit
         }
+       s.offset = len(s.src)
+       s.rdOffset = len(s.src)
+       s.ch = eof
+
+exit:
         return string(s.src[offs:s.offset])
  }
author	Rob Findley <rfindley@google.com>
	Thu, 1 Apr 2021 16:23:22 +0000 (12:23 -0400)
committer	Robert Findley <rfindley@google.com>
	Tue, 27 Apr 2021 17:05:58 +0000 (17:05 +0000)