From: Rob Findley <rfindley@google.com>
Date: Thu, 1 Apr 2021 16:23:22 +0000 (-0400)
Subject: go/scanner: optimize scanIdentifier
X-Git-Tag: go1.17beta1~422
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=cde6a675bc;p=gostls13.git

go/scanner: optimize scanIdentifier

While profiling parsing, I noticed that scanIdentifier was extremely
hot, and could be optimized: it is responsible for a significant
fraction of scanning and had a lot of unnecessary branching, bounds
checks, and function calls.

This CL implements some of those optimizations, while trying to strike a
balance between optimization and readability. It achieves this by
optimizing for the common case of ASCII identifiers, falling back on the
slower scan when encountering the first non-ASCII character.

Benchmark results:

name                               old time/op    new time/op    delta
Scan-12                              16.9µs ± 4%    15.8µs ± 5%   -6.92%  (p=0.000 n=20+18)
ScanFiles/go/types/expr.go-12         793µs ± 4%     672µs ± 6%  -15.23%  (p=0.000 n=20+20)
ScanFiles/go/parser/parser.go-12     1.08ms ± 6%    0.90ms ± 4%  -16.68%  (p=0.000 n=20+20)
ScanFiles/net/http/server.go-12      1.44ms ± 4%    1.23ms ± 5%  -14.58%  (p=0.000 n=18+20)
ScanFiles/go/scanner/errors.go-12    40.7µs ± 2%    32.6µs ± 3%  -20.01%  (p=0.000 n=19+20)

Change-Id: If78380004248e3ea75cfc78eb7f38f528124dced
Reviewed-on: https://go-review.googlesource.com/c/go/+/308611
Trust: Robert Findley <rfindley@google.com>
Trust: Robert Griesemer <gri@golang.org>
Run-TryBot: Robert Findley <rfindley@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Robert Griesemer <gri@golang.org>
---

diff --git a/src/go/scanner/scanner.go b/src/go/scanner/scanner.go
index 00fe2dc0b1..299c03fc97 100644
--- a/src/go/scanner/scanner.go
+++ b/src/go/scanner/scanner.go
@@ -48,11 +48,16 @@ type Scanner struct {
 	ErrorCount int // number of errors encountered
 }
 
-const bom = 0xFEFF // byte order mark, only permitted as very first character
+const (
+	bom = 0xFEFF // byte order mark, only permitted as very first character
+	eof = -1     // end of file
+)
 
 // Read the next Unicode char into s.ch.
 // s.ch < 0 means end-of-file.
 //
+// For optimization, there is some overlap between this method and
+// s.scanIdentifier.
 func (s *Scanner) next() {
 	if s.rdOffset < len(s.src) {
 		s.offset = s.rdOffset
@@ -81,7 +86,7 @@ func (s *Scanner) next() {
 			s.lineOffset = s.offset
 			s.file.AddLine(s.offset)
 		}
-		s.ch = -1 // eof
+		s.ch = eof
 	}
 }
 
@@ -347,11 +352,53 @@ func isDigit(ch rune) bool {
 	return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
 }
 
+// scanIdentifier reads the string of valid identifier characters at s.offset.
+// It must only be called when s.ch is known to be a valid letter.
+//
+// Be careful when making changes to this function: it is optimized and affects
+// scanning performance significantly.
 func (s *Scanner) scanIdentifier() string {
 	offs := s.offset
-	for isLetter(s.ch) || isDigit(s.ch) {
+
+	// Optimize for the common case of an ASCII identifier.
+	//
+	// Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
+	// avoids conversions to runes.
+	//
+	// In case we encounter a non-ASCII character, fall back on the slower path
+	// of calling into s.next().
+	for rdOffset, b := range s.src[s.rdOffset:] {
+		if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
+			// Avoid assigning a rune for the common case of an ascii character.
+			continue
+		}
+		s.rdOffset += rdOffset
+		if b < utf8.RuneSelf {
+			// Optimization: we've encountered an ASCII character that's not a letter
+			// or number. Avoid the call into s.next() and corresponding set up.
+			//
+			// Note that s.next() does some line accounting if s.ch is '\n', so this
+			// shortcut is only possible because we know that the preceding character
+			// is not '\n'.
+			s.ch = rune(b)
+			s.offset = s.rdOffset
+			s.rdOffset++
+			goto exit
+		}
+		// We know that the preceding character is valid for an identifier because
+		// scanIdentifier is only called when s.ch is a letter, so calling s.next()
+		// at s.rdOffset resets the scanner state.
 		s.next()
+		for isLetter(s.ch) || isDigit(s.ch) {
+			s.next()
+		}
+		goto exit
 	}
+	s.offset = len(s.src)
+	s.rdOffset = len(s.src)
+	s.ch = eof
+
+exit:
 	return string(s.src[offs:s.offset])
 }