go/scanner: 17% faster scanning

author Robert Griesemer <gri@golang.org>

Wed, 11 Jan 2012 22:20:32 +0000 (14:20 -0800)

committer Robert Griesemer <gri@golang.org>

Wed, 11 Jan 2012 22:20:32 +0000 (14:20 -0800)
author Robert Griesemer <gri@golang.org>
Wed, 11 Jan 2012 22:20:32 +0000 (14:20 -0800)
committer Robert Griesemer <gri@golang.org>
Wed, 11 Jan 2012 22:20:32 +0000 (14:20 -0800)
diff --git a/src/cmd/cgo/gcc.go b/src/cmd/cgo/gcc.go

index 75ce1782a06d0976ef692c0a3189f2d6471b7fdd..486090e90efba91034359e49504cdf120bd78d3d 100644 (file)
--- a/src/cmd/cgo/gcc.go
+++ b/src/cmd/cgo/gcc.go
@@ -1374,7 +1374,7 @@ func (c *typeConv) Struct(dt *dwarf.StructType) (expr *ast.StructType, csyntax s
  
         if !*godefs && !*cdefs {
                 for cid, goid := range ident {
-                       if token.Lookup([]byte(goid)).IsKeyword() {
+                       if token.Lookup(goid).IsKeyword() {
                                 // Avoid keyword
                                 goid = "_" + goid
  
diff --git a/src/pkg/go/scanner/scanner.go b/src/pkg/go/scanner/scanner.go

index c5d83eba5868f92da207b5c79059900c19300d6d..59a796574f60bb21051984dcbd51c984c531bfa5 100644 (file)
--- a/src/pkg/go/scanner/scanner.go
+++ b/src/pkg/go/scanner/scanner.go
@@ -157,7 +157,7 @@ func (S *Scanner) interpretLineComment(text []byte) {
         }
  }
  
-func (S *Scanner) scanComment() {
+func (S *Scanner) scanComment() string {
         // initial '/' already consumed; S.ch == '/' || S.ch == '*'
         offs := S.offset - 1 // position of initial '/'
  
@@ -171,7 +171,7 @@ func (S *Scanner) scanComment() {
                         // comment starts at the beginning of the current line
                         S.interpretLineComment(S.src[offs:S.offset])
                 }
-               return
+               goto exit
         }
  
         /*-style comment */
@@ -181,11 +181,14 @@ func (S *Scanner) scanComment() {
                 S.next()
                 if ch == '*' && S.ch == '/' {
                         S.next()
-                       return
+                       goto exit
                 }
         }
  
         S.error(offs, "comment not terminated")
+
+exit:
+       return string(S.src[offs:S.offset])
  }
  
  func (S *Scanner) findLineEnd() bool {
@@ -240,12 +243,12 @@ func isDigit(ch rune) bool {
         return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
  }
  
-func (S *Scanner) scanIdentifier() token.Token {
+func (S *Scanner) scanIdentifier() string {
         offs := S.offset
         for isLetter(S.ch) || isDigit(S.ch) {
                 S.next()
         }
-       return token.Lookup(S.src[offs:S.offset])
+       return string(S.src[offs:S.offset])
  }
  
  func digitVal(ch rune) int {
@@ -266,11 +269,13 @@ func (S *Scanner) scanMantissa(base int) {
         }
  }
  
-func (S *Scanner) scanNumber(seenDecimalPoint bool) token.Token {
+func (S *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
         // digitVal(S.ch) < 10
+       offs := S.offset
         tok := token.INT
  
         if seenDecimalPoint {
+               offs--
                 tok = token.FLOAT
                 S.scanMantissa(10)
                 goto exponent
@@ -334,7 +339,7 @@ exponent:
         }
  
  exit:
-       return tok
+       return tok, string(S.src[offs:S.offset])
  }
  
  func (S *Scanner) scanEscape(quote rune) {
@@ -381,7 +386,7 @@ func (S *Scanner) scanEscape(quote rune) {
         }
  }
  
-func (S *Scanner) scanChar() {
+func (S *Scanner) scanChar() string {
         // '\'' opening already consumed
         offs := S.offset - 1
  
@@ -405,9 +410,11 @@ func (S *Scanner) scanChar() {
         if n != 1 {
                 S.error(offs, "illegal character literal")
         }
+
+       return string(S.src[offs:S.offset])
  }
  
-func (S *Scanner) scanString() {
+func (S *Scanner) scanString() string {
         // '"' opening already consumed
         offs := S.offset - 1
  
@@ -424,12 +431,27 @@ func (S *Scanner) scanString() {
         }
  
         S.next()
+
+       return string(S.src[offs:S.offset])
+}
+
+func stripCR(b []byte) []byte {
+       c := make([]byte, len(b))
+       i := 0
+       for _, ch := range b {
+               if ch != '\r' {
+                       c[i] = ch
+                       i++
+               }
+       }
+       return c[:i]
  }
  
-func (S *Scanner) scanRawString() (hasCR bool) {
+func (S *Scanner) scanRawString() string {
         // '`' opening already consumed
         offs := S.offset - 1
  
+       hasCR := false
         for S.ch != '`' {
                 ch := S.ch
                 S.next()
@@ -443,7 +465,13 @@ func (S *Scanner) scanRawString() (hasCR bool) {
         }
  
         S.next()
-       return
+
+       lit := S.src[offs:S.offset]
+       if hasCR {
+               lit = stripCR(lit)
+       }
+
+       return string(lit)
  }
  
  func (S *Scanner) skipWhitespace() {
@@ -494,27 +522,24 @@ func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Tok
         return tok0
  }
  
-func stripCR(b []byte) []byte {
-       c := make([]byte, len(b))
-       i := 0
-       for _, ch := range b {
-               if ch != '\r' {
-                       c[i] = ch
-                       i++
-               }
-       }
-       return c[:i]
-}
-
-// Scan scans the next token and returns the token position,
-// the token, and the literal string corresponding to the
-// token. The source end is indicated by token.EOF.
+// Scan scans the next token and returns the token position, the token,
+// and its literal string if applicable. The source end is indicated by
+// token.EOF.
+//
+// If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
+// token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
+// has the corresponding value.
  //
  // If the returned token is token.SEMICOLON, the corresponding
  // literal string is ";" if the semicolon was present in the source,
  // and "\n" if the semicolon was inserted because of a newline or
  // at EOF.
  //
+// If the returned token is token.ILLEGAL, the literal string is the
+// offending character.
+//
+// In all other cases, Scan returns an empty literal string.
+//
  // For more tolerant parsing, Scan will return a valid token if
  // possible even if a syntax error was encountered. Thus, even
  // if the resulting token sequence contains no illegal tokens,
@@ -526,34 +551,33 @@ func stripCR(b []byte) []byte {
  // set with Init. Token positions are relative to that file
  // and thus relative to the file set.
  //
-func (S *Scanner) Scan() (token.Pos, token.Token, string) {
+func (S *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
  scanAgain:
         S.skipWhitespace()
  
         // current token start
-       insertSemi := false
-       offs := S.offset
-       tok := token.ILLEGAL
-       hasCR := false
+       pos = S.file.Pos(S.offset)
  
         // determine token value
+       insertSemi := false
         switch ch := S.ch; {
         case isLetter(ch):
-               tok = S.scanIdentifier()
+               lit = S.scanIdentifier()
+               tok = token.Lookup(lit)
                 switch tok {
                 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
                         insertSemi = true
                 }
         case digitVal(ch) < 10:
                 insertSemi = true
-               tok = S.scanNumber(false)
+               tok, lit = S.scanNumber(false)
         default:
                 S.next() // always make progress
                 switch ch {
                 case -1:
                         if S.insertSemi {
                                 S.insertSemi = false // EOF consumed
-                               return S.file.Pos(offs), token.SEMICOLON, "\n"
+                               return pos, token.SEMICOLON, "\n"
                         }
                         tok = token.EOF
                 case '\n':
@@ -561,25 +585,25 @@ scanAgain:
                         // set in the first place and exited early
                         // from S.skipWhitespace()
                         S.insertSemi = false // newline consumed
-                       return S.file.Pos(offs), token.SEMICOLON, "\n"
+                       return pos, token.SEMICOLON, "\n"
                 case '"':
                         insertSemi = true
                         tok = token.STRING
-                       S.scanString()
+                       lit = S.scanString()
                 case '\'':
                         insertSemi = true
                         tok = token.CHAR
-                       S.scanChar()
+                       lit = S.scanChar()
                 case '`':
                         insertSemi = true
                         tok = token.STRING
-                       hasCR = S.scanRawString()
+                       lit = S.scanRawString()
                 case ':':
                         tok = S.switch2(token.COLON, token.DEFINE)
                 case '.':
                         if digitVal(S.ch) < 10 {
                                 insertSemi = true
-                               tok = S.scanNumber(true)
+                               tok, lit = S.scanNumber(true)
                         } else if S.ch == '.' {
                                 S.next()
                                 if S.ch == '.' {
@@ -593,6 +617,7 @@ scanAgain:
                         tok = token.COMMA
                 case ';':
                         tok = token.SEMICOLON
+                       lit = ";"
                 case '(':
                         tok = token.LPAREN
                 case ')':
@@ -626,12 +651,12 @@ scanAgain:
                                 if S.insertSemi && S.findLineEnd() {
                                         // reset position to the beginning of the comment
                                         S.ch = '/'
-                                       S.offset = offs
-                                       S.rdOffset = offs + 1
+                                       S.offset = S.file.Offset(pos)
+                                       S.rdOffset = S.offset + 1
                                         S.insertSemi = false // newline consumed
-                                       return S.file.Pos(offs), token.SEMICOLON, "\n"
+                                       return pos, token.SEMICOLON, "\n"
                                 }
-                               S.scanComment()
+                               lit = S.scanComment()
                                 if S.mode&ScanComments == 0 {
                                         // skip comment
                                         S.insertSemi = false // newline consumed
@@ -668,21 +693,15 @@ scanAgain:
                 case '|':
                         tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
                 default:
-                       S.error(offs, fmt.Sprintf("illegal character %#U", ch))
+                       S.error(S.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
                         insertSemi = S.insertSemi // preserve insertSemi info
+                       tok = token.ILLEGAL
+                       lit = string(ch)
                 }
         }
-
         if S.mode&dontInsertSemis == 0 {
                 S.insertSemi = insertSemi
         }
  
-       // TODO(gri): The scanner API should change such that the literal string
-       //            is only valid if an actual literal was scanned. This will
-       //            permit a more efficient implementation.
-       lit := S.src[offs:S.offset]
-       if hasCR {
-               lit = stripCR(lit)
-       }
-       return S.file.Pos(offs), tok, string(lit)
+       return
  }
diff --git a/src/pkg/go/scanner/scanner_test.go b/src/pkg/go/scanner/scanner_test.go

index fd3a7cf6600c2b6fbf6255715852b5a2493857b5..2e4dd4fff638860b79aaf2a1e8a86930a014060f 100644 (file)
--- a/src/pkg/go/scanner/scanner_test.go
+++ b/src/pkg/go/scanner/scanner_test.go
@@ -177,6 +177,15 @@ var tokens = [...]elt{
  
  const whitespace = "  \t  \n\n\n" // to separate tokens
  
+var source = func() []byte {
+       var src []byte
+       for _, t := range tokens {
+               src = append(src, t.lit...)
+               src = append(src, whitespace...)
+       }
+       return src
+}()
+
  type testErrorHandler struct {
         t *testing.T
  }
@@ -214,20 +223,20 @@ func checkPos(t *testing.T, lit string, p token.Pos, expected token.Position) {
  // Verify that calling Scan() provides the correct results.
  func TestScan(t *testing.T) {
         // make source
-       var src string
-       for _, e := range tokens {
-               src += e.lit + whitespace
-       }
-       src_linecount := newlineCount(src)
+       src_linecount := newlineCount(string(source))
         whitespace_linecount := newlineCount(whitespace)
  
         // verify scan
         var s Scanner
-       s.Init(fset.AddFile("", fset.Base(), len(src)), []byte(src), &testErrorHandler{t}, ScanComments|dontInsertSemis)
+       s.Init(fset.AddFile("", fset.Base(), len(source)), source, &testErrorHandler{t}, ScanComments|dontInsertSemis)
         index := 0
         epos := token.Position{"", 0, 1, 1} // expected position
         for {
                 pos, tok, lit := s.Scan()
+               if lit == "" {
+                       // no literal value for non-literal tokens
+                       lit = tok.String()
+               }
                 e := elt{token.EOF, "", special}
                 if index < len(tokens) {
                         e = tokens[index]
@@ -659,3 +668,20 @@ func TestScanErrors(t *testing.T) {
                 checkError(t, e.src, e.tok, e.pos, e.err)
         }
  }
+
+func BenchmarkScan(b *testing.B) {
+       b.StopTimer()
+       fset := token.NewFileSet()
+       file := fset.AddFile("", fset.Base(), len(source))
+       var s Scanner
+       b.StartTimer()
+       for i := b.N - 1; i >= 0; i-- {
+               s.Init(file, source, nil, ScanComments)
+               for {
+                       _, tok, _ := s.Scan()
+                       if tok == token.EOF {
+                               break
+                       }
+               }
+       }
+}
diff --git a/src/pkg/go/token/token.go b/src/pkg/go/token/token.go

index 557374052c91ad96d010e1b3973eb7b5632fab0c..84b6314d57af1c8a623fec7b5cd155ca32318c1b 100644 (file)
--- a/src/pkg/go/token/token.go
+++ b/src/pkg/go/token/token.go
@@ -283,10 +283,8 @@ func init() {
  
  // Lookup maps an identifier to its keyword token or IDENT (if not a keyword).
  //
-func Lookup(ident []byte) Token {
-       // TODO Maps with []byte key are illegal because []byte does not
-       //      support == . Should find a more efficient solution eventually.
-       if tok, is_keyword := keywords[string(ident)]; is_keyword {
+func Lookup(ident string) Token {
+       if tok, is_keyword := keywords[ident]; is_keyword {
                 return tok
         }
         return IDENT
@@ -295,16 +293,16 @@ func Lookup(ident []byte) Token {
  // Predicates
  
  // IsLiteral returns true for tokens corresponding to identifiers
-// and basic type literals; returns false otherwise.
+// and basic type literals; it returns false otherwise.
  //
  func (tok Token) IsLiteral() bool { return literal_beg < tok && tok < literal_end }
  
  // IsOperator returns true for tokens corresponding to operators and
-// delimiters; returns false otherwise.
+// delimiters; it returns false otherwise.
  //
  func (tok Token) IsOperator() bool { return operator_beg < tok && tok < operator_end }
  
  // IsKeyword returns true for tokens corresponding to keywords;
-// returns false otherwise.
+// it returns false otherwise.
  //
  func (tok Token) IsKeyword() bool { return keyword_beg < tok && tok < keyword_end }
author	Robert Griesemer <gri@golang.org>
	Wed, 11 Jan 2012 22:20:32 +0000 (14:20 -0800)
committer	Robert Griesemer <gri@golang.org>
	Wed, 11 Jan 2012 22:20:32 +0000 (14:20 -0800)
src/cmd/cgo/gcc.go		patch | blob | history
src/pkg/go/scanner/scanner.go		patch | blob | history
src/pkg/go/scanner/scanner_test.go		patch | blob | history
src/pkg/go/token/token.go		patch | blob | history