cmd/compile/internal/syntax: faster and simpler source reader

author Robert Griesemer <gri@golang.org>

Thu, 27 Feb 2020 05:31:00 +0000 (21:31 -0800)

committer Robert Griesemer <gri@golang.org>

Thu, 5 Mar 2020 19:25:46 +0000 (19:25 +0000)
author Robert Griesemer <gri@golang.org>
Thu, 27 Feb 2020 05:31:00 +0000 (21:31 -0800)
committer Robert Griesemer <gri@golang.org>
Thu, 5 Mar 2020 19:25:46 +0000 (19:25 +0000)
diff --git a/src/cmd/compile/internal/syntax/parser.go b/src/cmd/compile/internal/syntax/parser.go

index 469d9ad69baed86264488e9323cfb5d419a61654..5e52800b396029c03bd160ab470569133923df47 100644 (file)
--- a/src/cmd/compile/internal/syntax/parser.go
+++ b/src/cmd/compile/internal/syntax/parser.go
@@ -419,7 +419,7 @@ func (p *parser) fileOrNil() *File {
         }
         // p.tok == _EOF
  
-       f.Lines = p.source.line
+       f.Lines = p.line
  
         return f
  }
diff --git a/src/cmd/compile/internal/syntax/scanner.go b/src/cmd/compile/internal/syntax/scanner.go

index f2f6fd2bb6b333c94a2b5714f9fa2998f4dde096..2ce6203dd9802bfe9bf3361175229aefd32a57ad 100644 (file)
--- a/src/cmd/compile/internal/syntax/scanner.go
+++ b/src/cmd/compile/internal/syntax/scanner.go
@@ -6,9 +6,9 @@
  // Go source. After initialization, consecutive calls of
  // next advance the scanner one token at a time.
  //
-// This file, source.go, and tokens.go are self-contained
-// (go tool compile scanner.go source.go tokens.go compiles)
-// and thus could be made into its own package.
+// This file, source.go, tokens.go, and token_string.go are self-contained
+// (`go tool compile scanner.go source.go tokens.go token_string.go` compiles)
+// and thus could be made into their own package.
  
  package syntax
  
@@ -86,20 +86,21 @@ func (s *scanner) next() {
  
  redo:
         // skip white space
-       c := s.getr()
-       for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' {
-               c = s.getr()
+       s.stop()
+       for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !nlsemi || s.ch == '\r' {
+               s.nextch()
         }
  
         // token start
-       s.line, s.col = s.source.line0, s.source.col0
-
-       if isLetter(c) || c >= utf8.RuneSelf && s.isIdentRune(c, true) {
+       s.line, s.col = s.pos()
+       s.start()
+       if isLetter(s.ch) || s.ch >= utf8.RuneSelf && s.atIdentChar(true) {
+               s.nextch()
                 s.ident()
                 return
         }
  
-       switch c {
+       switch s.ch {
         case -1:
                 if nlsemi {
                         s.lit = "EOF"
@@ -109,11 +110,12 @@ redo:
                 s.tok = _EOF
  
         case '\n':
+               s.nextch()
                 s.lit = "newline"
                 s.tok = _Semi
  
         case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
-               s.number(c)
+               s.number(false)
  
         case '"':
                 s.stdString()
@@ -125,97 +127,110 @@ redo:
                 s.rune()
  
         case '(':
+               s.nextch()
                 s.tok = _Lparen
  
         case '[':
+               s.nextch()
                 s.tok = _Lbrack
  
         case '{':
+               s.nextch()
                 s.tok = _Lbrace
  
         case ',':
+               s.nextch()
                 s.tok = _Comma
  
         case ';':
+               s.nextch()
                 s.lit = "semicolon"
                 s.tok = _Semi
  
         case ')':
+               s.nextch()
                 s.nlsemi = true
                 s.tok = _Rparen
  
         case ']':
+               s.nextch()
                 s.nlsemi = true
                 s.tok = _Rbrack
  
         case '}':
+               s.nextch()
                 s.nlsemi = true
                 s.tok = _Rbrace
  
         case ':':
-               if s.getr() == '=' {
+               s.nextch()
+               if s.ch == '=' {
+                       s.nextch()
                         s.tok = _Define
                         break
                 }
-               s.ungetr()
                 s.tok = _Colon
  
         case '.':
-               c = s.getr()
-               if isDecimal(c) {
-                       s.ungetr()
-                       s.unread(1) // correct position of '.' (needed by startLit in number)
-                       s.number('.')
+               s.nextch()
+               if isDecimal(s.ch) {
+                       s.number(true)
                         break
                 }
-               if c == '.' {
-                       c = s.getr()
-                       if c == '.' {
+               if s.ch == '.' {
+                       s.nextch()
+                       if s.ch == '.' {
+                               s.nextch()
                                 s.tok = _DotDotDot
                                 break
                         }
-                       s.unread(1)
+                       s.rewind() // now s.ch holds 1st '.'
+                       s.nextch() // consume 1st '.' again
                 }
-               s.ungetr()
                 s.tok = _Dot
  
         case '+':
+               s.nextch()
                 s.op, s.prec = Add, precAdd
-               c = s.getr()
-               if c != '+' {
+               if s.ch != '+' {
                         goto assignop
                 }
+               s.nextch()
                 s.nlsemi = true
                 s.tok = _IncOp
  
         case '-':
+               s.nextch()
                 s.op, s.prec = Sub, precAdd
-               c = s.getr()
-               if c != '-' {
+               if s.ch != '-' {
                         goto assignop
                 }
+               s.nextch()
                 s.nlsemi = true
                 s.tok = _IncOp
  
         case '*':
+               s.nextch()
                 s.op, s.prec = Mul, precMul
                 // don't goto assignop - want _Star token
-               if s.getr() == '=' {
+               if s.ch == '=' {
+                       s.nextch()
                         s.tok = _AssignOp
                         break
                 }
-               s.ungetr()
                 s.tok = _Star
  
         case '/':
-               c = s.getr()
-               if c == '/' {
+               s.nextch()
+               if s.ch == '/' {
+                       s.nextch()
                         s.lineComment()
                         goto redo
                 }
-               if c == '*' {
+               if s.ch == '*' {
+                       s.nextch()
                         s.fullComment()
-                       if s.source.line > s.line && nlsemi {
+                       if line, _ := s.pos(); line > s.line && nlsemi {
                                 // A multi-line comment acts like a newline;
                                 // it translates to a ';' if nlsemi is set.
                                 s.lit = "newline"
@@ -228,27 +243,29 @@ redo:
                 goto assignop
  
         case '%':
+               s.nextch()
                 s.op, s.prec = Rem, precMul
-               c = s.getr()
                 goto assignop
  
         case '&':
-               c = s.getr()
-               if c == '&' {
+               s.nextch()
+               if s.ch == '&' {
+                       s.nextch()
                         s.op, s.prec = AndAnd, precAndAnd
                         s.tok = _Operator
                         break
                 }
                 s.op, s.prec = And, precMul
-               if c == '^' {
+               if s.ch == '^' {
+                       s.nextch()
                         s.op = AndNot
-                       c = s.getr()
                 }
                 goto assignop
  
         case '|':
-               c = s.getr()
-               if c == '|' {
+               s.nextch()
+               if s.ch == '|' {
+                       s.nextch()
                         s.op, s.prec = OrOr, precOrOr
                         s.tok = _Operator
                         break
@@ -257,106 +274,100 @@ redo:
                 goto assignop
  
         case '^':
+               s.nextch()
                 s.op, s.prec = Xor, precAdd
-               c = s.getr()
                 goto assignop
  
         case '<':
-               c = s.getr()
-               if c == '=' {
+               s.nextch()
+               if s.ch == '=' {
+                       s.nextch()
                         s.op, s.prec = Leq, precCmp
                         s.tok = _Operator
                         break
                 }
-               if c == '<' {
+               if s.ch == '<' {
+                       s.nextch()
                         s.op, s.prec = Shl, precMul
-                       c = s.getr()
                         goto assignop
                 }
-               if c == '-' {
+               if s.ch == '-' {
+                       s.nextch()
                         s.tok = _Arrow
                         break
                 }
-               s.ungetr()
                 s.op, s.prec = Lss, precCmp
                 s.tok = _Operator
  
         case '>':
-               c = s.getr()
-               if c == '=' {
+               s.nextch()
+               if s.ch == '=' {
+                       s.nextch()
                         s.op, s.prec = Geq, precCmp
                         s.tok = _Operator
                         break
                 }
-               if c == '>' {
+               if s.ch == '>' {
+                       s.nextch()
                         s.op, s.prec = Shr, precMul
-                       c = s.getr()
                         goto assignop
                 }
-               s.ungetr()
                 s.op, s.prec = Gtr, precCmp
                 s.tok = _Operator
  
         case '=':
-               if s.getr() == '=' {
+               s.nextch()
+               if s.ch == '=' {
+                       s.nextch()
                         s.op, s.prec = Eql, precCmp
                         s.tok = _Operator
                         break
                 }
-               s.ungetr()
                 s.tok = _Assign
  
         case '!':
-               if s.getr() == '=' {
+               s.nextch()
+               if s.ch == '=' {
+                       s.nextch()
                         s.op, s.prec = Neq, precCmp
                         s.tok = _Operator
                         break
                 }
-               s.ungetr()
                 s.op, s.prec = Not, 0
                 s.tok = _Operator
  
         default:
-               s.tok = 0
-               s.errorf("invalid character %#U", c)
+               s.errorf("invalid character %#U", s.ch)
+               s.nextch()
                 goto redo
         }
  
         return
  
  assignop:
-       if c == '=' {
+       if s.ch == '=' {
+               s.nextch()
                 s.tok = _AssignOp
                 return
         }
-       s.ungetr()
         s.tok = _Operator
  }
  
-func isLetter(c rune) bool {
-       return 'a' <= lower(c) && lower(c) <= 'z' || c == '_'
-}
-
  func (s *scanner) ident() {
-       s.startLit()
-
         // accelerate common case (7bit ASCII)
-       c := s.getr()
-       for isLetter(c) || isDecimal(c) {
-               c = s.getr()
+       for isLetter(s.ch) || isDecimal(s.ch) {
+               s.nextch()
         }
  
         // general case
-       if c >= utf8.RuneSelf {
-               for s.isIdentRune(c, false) {
-                       c = s.getr()
+       if s.ch >= utf8.RuneSelf {
+               for s.atIdentChar(false) {
+                       s.nextch()
                 }
         }
-       s.ungetr()
-
-       lit := s.stopLit()
  
         // possibly a keyword
+       lit := s.segment()
         if len(lit) >= 2 {
                 if tok := keywordMap[hash(lit)]; tok != 0 && tokStrFast(tok) == string(lit) {
                         s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
@@ -376,16 +387,16 @@ func tokStrFast(tok token) string {
         return _token_name[_token_index[tok-1]:_token_index[tok]]
  }
  
-func (s *scanner) isIdentRune(c rune, first bool) bool {
+func (s *scanner) atIdentChar(first bool) bool {
         switch {
-       case unicode.IsLetter(c) || c == '_':
+       case unicode.IsLetter(s.ch) || s.ch == '_':
                 // ok
-       case unicode.IsDigit(c):
+       case unicode.IsDigit(s.ch):
                 if first {
-                       s.errorf("identifier cannot begin with digit %#U", c)
+                       s.errorf("identifier cannot begin with digit %#U", s.ch)
                 }
-       case c >= utf8.RuneSelf:
-               s.errorf("invalid character %#U in identifier", c)
+       case s.ch >= utf8.RuneSelf:
+               s.errorf("invalid character %#U in identifier", s.ch)
         default:
                 return false
         }
@@ -411,46 +422,45 @@ func init() {
         }
  }
  
-func lower(c rune) rune     { return ('a' - 'A') | c } // returns lower-case c iff c is ASCII letter
-func isDecimal(c rune) bool { return '0' <= c && c <= '9' }
-func isHex(c rune) bool     { return '0' <= c && c <= '9' || 'a' <= lower(c) && lower(c) <= 'f' }
+func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
+func isLetter(ch rune) bool  { return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' }
+func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
+func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
  
-// digits accepts the sequence { digit | '_' } starting with c0.
+// digits accepts the sequence { digit | '_' }.
  // If base <= 10, digits accepts any decimal digit but records
  // the index (relative to the literal start) of a digit >= base
  // in *invalid, if *invalid < 0.
-// digits returns the first rune that is not part of the sequence
-// anymore, and a bitset describing whether the sequence contained
+// digits returns a bitset describing whether the sequence contained
  // digits (bit 0 is set), or separators '_' (bit 1 is set).
-func (s *scanner) digits(c0 rune, base int, invalid *int) (c rune, digsep int) {
-       c = c0
+func (s *scanner) digits(base int, invalid *int) (digsep int) {
         if base <= 10 {
                 max := rune('0' + base)
-               for isDecimal(c) || c == '_' {
+               for isDecimal(s.ch) || s.ch == '_' {
                         ds := 1
-                       if c == '_' {
+                       if s.ch == '_' {
                                 ds = 2
-                       } else if c >= max && *invalid < 0 {
-                               *invalid = int(s.col0 - s.col) // record invalid rune index
+                       } else if s.ch >= max && *invalid < 0 {
+                               _, col := s.pos()
+                               *invalid = int(col - s.col) // record invalid rune index
                         }
                         digsep |= ds
-                       c = s.getr()
+                       s.nextch()
                 }
         } else {
-               for isHex(c) || c == '_' {
+               for isHex(s.ch) || s.ch == '_' {
                         ds := 1
-                       if c == '_' {
+                       if s.ch == '_' {
                                 ds = 2
                         }
                         digsep |= ds
-                       c = s.getr()
+                       s.nextch()
                 }
         }
         return
  }
  
-func (s *scanner) number(c rune) {
-       s.startLit()
+func (s *scanner) number(seenPoint bool) {
         s.bad = false
  
         base := 10        // number base
@@ -459,38 +469,39 @@ func (s *scanner) number(c rune) {
         invalid := -1     // index of invalid digit in literal, or < 0
  
         // integer part
-       var ds int
-       if c != '.' {
+       if !seenPoint {
                 s.kind = IntLit
-               if c == '0' {
-                       c = s.getr()
-                       switch lower(c) {
+               if s.ch == '0' {
+                       s.nextch()
+                       switch lower(s.ch) {
                         case 'x':
-                               c = s.getr()
+                               s.nextch()
                                 base, prefix = 16, 'x'
                         case 'o':
-                               c = s.getr()
+                               s.nextch()
                                 base, prefix = 8, 'o'
                         case 'b':
-                               c = s.getr()
+                               s.nextch()
                                 base, prefix = 2, 'b'
                         default:
                                 base, prefix = 8, '0'
                                 digsep = 1 // leading 0
                         }
                 }
-               c, ds = s.digits(c, base, &invalid)
-               digsep |= ds
+               digsep |= s.digits(base, &invalid)
+               if s.ch == '.' {
+                       if prefix == 'o' || prefix == 'b' {
+                               s.errorf("invalid radix point in %s", litname(prefix))
+                       }
+                       s.nextch()
+                       seenPoint = true
+               }
         }
  
         // fractional part
-       if c == '.' {
+       if seenPoint {
                 s.kind = FloatLit
-               if prefix == 'o' || prefix == 'b' {
-                       s.errorf("invalid radix point in %s", litname(prefix))
-               }
-               c, ds = s.digits(s.getr(), base, &invalid)
-               digsep |= ds
+               digsep |= s.digits(base, &invalid)
         }
  
         if digsep&1 == 0 && !s.bad {
@@ -498,23 +509,22 @@ func (s *scanner) number(c rune) {
         }
  
         // exponent
-       if e := lower(c); e == 'e' || e == 'p' {
+       if e := lower(s.ch); e == 'e' || e == 'p' {
                 if !s.bad {
                         switch {
                         case e == 'e' && prefix != 0 && prefix != '0':
-                               s.errorf("%q exponent requires decimal mantissa", c)
+                               s.errorf("%q exponent requires decimal mantissa", s.ch)
                         case e == 'p' && prefix != 'x':
-                               s.errorf("%q exponent requires hexadecimal mantissa", c)
+                               s.errorf("%q exponent requires hexadecimal mantissa", s.ch)
                         }
                 }
-               c = s.getr()
+               s.nextch()
                 s.kind = FloatLit
-               if c == '+' || c == '-' {
-                       c = s.getr()
+               if s.ch == '+' || s.ch == '-' {
+                       s.nextch()
                 }
-               c, ds = s.digits(c, 10, nil)
-               digsep |= ds
-               if ds&1 == 0 && !s.bad {
+               digsep = s.digits(10, nil) | digsep&2 // don't lose sep bit
+               if digsep&1 == 0 && !s.bad {
                         s.errorf("exponent has no digits")
                 }
         } else if prefix == 'x' && s.kind == FloatLit && !s.bad {
@@ -522,14 +532,13 @@ func (s *scanner) number(c rune) {
         }
  
         // suffix 'i'
-       if c == 'i' {
+       if s.ch == 'i' {
                 s.kind = ImagLit
-               c = s.getr()
+               s.nextch()
         }
-       s.ungetr()
  
         s.nlsemi = true
-       s.lit = string(s.stopLit())
+       s.lit = string(s.segment())
         s.tok = _Literal
  
         if s.kind == IntLit && invalid >= 0 && !s.bad {
@@ -596,199 +605,195 @@ func invalidSep(x string) int {
  }
  
  func (s *scanner) rune() {
-       s.startLit()
         s.bad = false
+       s.nextch()
  
         n := 0
         for ; ; n++ {
-               r := s.getr()
-               if r == '\'' {
+               if s.ch == '\'' {
+                       if !s.bad {
+                               if n == 0 {
+                                       s.errorf("empty rune literal or unescaped '")
+                               } else if n != 1 {
+                                       s.errorAtf(0, "more than one character in rune literal")
+                               }
+                       }
+                       s.nextch()
                         break
                 }
-               if r == '\\' {
+               if s.ch == '\\' {
+                       s.nextch()
                         s.escape('\'')
                         continue
                 }
-               if r == '\n' {
-                       s.ungetr() // assume newline is not part of literal
+               if s.ch == '\n' {
                         if !s.bad {
                                 s.errorf("newline in rune literal")
                         }
                         break
                 }
-               if r < 0 {
+               if s.ch < 0 {
                         if !s.bad {
                                 s.errorAtf(0, "rune literal not terminated")
                         }
                         break
                 }
-       }
-
-       if !s.bad {
-               if n == 0 {
-                       s.errorf("empty rune literal or unescaped '")
-               } else if n != 1 {
-                       s.errorAtf(0, "more than one character in rune literal")
-               }
+               s.nextch()
         }
  
         s.nlsemi = true
-       s.lit = string(s.stopLit())
+       s.lit = string(s.segment())
         s.kind = RuneLit
         s.tok = _Literal
  }
  
  func (s *scanner) stdString() {
-       s.startLit()
         s.bad = false
+       s.nextch()
  
         for {
-               r := s.getr()
-               if r == '"' {
+               if s.ch == '"' {
+                       s.nextch()
                         break
                 }
-               if r == '\\' {
+               if s.ch == '\\' {
+                       s.nextch()
                         s.escape('"')
                         continue
                 }
-               if r == '\n' {
-                       s.ungetr() // assume newline is not part of literal
+               if s.ch == '\n' {
                         s.errorf("newline in string")
                         break
                 }
-               if r < 0 {
+               if s.ch < 0 {
                         s.errorAtf(0, "string not terminated")
                         break
                 }
+               s.nextch()
         }
  
         s.nlsemi = true
-       s.lit = string(s.stopLit())
+       s.lit = string(s.segment())
         s.kind = StringLit
         s.tok = _Literal
  }
  
  func (s *scanner) rawString() {
-       s.startLit()
         s.bad = false
+       s.nextch()
  
         for {
-               r := s.getr()
-               if r == '`' {
+               if s.ch == '`' {
+                       s.nextch()
                         break
                 }
-               if r < 0 {
+               if s.ch < 0 {
                         s.errorAtf(0, "string not terminated")
                         break
                 }
+               s.nextch()
         }
         // We leave CRs in the string since they are part of the
         // literal (even though they are not part of the literal
         // value).
  
         s.nlsemi = true
-       s.lit = string(s.stopLit())
+       s.lit = string(s.segment())
         s.kind = StringLit
         s.tok = _Literal
  }
  
  func (s *scanner) comment(text string) {
-       s.errh(s.line, s.col, text)
+       s.errorAtf(0, text)
  }
  
-func (s *scanner) skipLine(r rune) {
-       for r >= 0 {
-               if r == '\n' {
-                       s.ungetr() // don't consume '\n' - needed for nlsemi logic
-                       break
-               }
-               r = s.getr()
+func (s *scanner) skipLine() {
+       // don't consume '\n' - needed for nlsemi logic
+       for s.ch >= 0 && s.ch != '\n' {
+               s.nextch()
         }
  }
  
  func (s *scanner) lineComment() {
-       r := s.getr()
+       // opening has already been consumed
  
         if s.mode&comments != 0 {
-               s.startLit()
-               s.skipLine(r)
-               s.comment("//" + string(s.stopLit()))
+               s.skipLine()
+               s.comment(string(s.segment()))
                 return
         }
  
         // directives must start at the beginning of the line (s.col == colbase)
-       if s.mode&directives == 0 || s.col != colbase || (r != 'g' && r != 'l') {
-               s.skipLine(r)
+       if s.mode&directives == 0 || s.col != colbase || (s.ch != 'g' && s.ch != 'l') {
+               s.stop()
+               s.skipLine()
                 return
         }
  
         // recognize go: or line directives
         prefix := "go:"
-       if r == 'l' {
+       if s.ch == 'l' {
                 prefix = "line "
         }
         for _, m := range prefix {
-               if r != m {
-                       s.skipLine(r)
+               if s.ch != m {
+                       s.stop()
+                       s.skipLine()
                         return
                 }
-               r = s.getr()
+               s.nextch()
         }
  
         // directive text
-       s.startLit()
-       s.skipLine(r)
-       s.comment("//" + prefix + string(s.stopLit()))
+       s.skipLine()
+       s.comment(string(s.segment()))
  }
  
-func (s *scanner) skipComment(r rune) bool {
-       for r >= 0 {
-               for r == '*' {
-                       r = s.getr()
-                       if r == '/' {
+func (s *scanner) skipComment() bool {
+       for s.ch >= 0 {
+               for s.ch == '*' {
+                       s.nextch()
+                       if s.ch == '/' {
+                               s.nextch()
                                 return true
                         }
                 }
-               r = s.getr()
+               s.nextch()
         }
         s.errorAtf(0, "comment not terminated")
         return false
  }
  
  func (s *scanner) fullComment() {
-       r := s.getr()
+       /* opening has already been consumed */
  
         if s.mode&comments != 0 {
-               s.startLit()
-               if s.skipComment(r) {
-                       s.comment("/*" + string(s.stopLit()))
-               } else {
-                       s.killLit() // not a complete comment - ignore
+               if s.skipComment() {
+                       s.comment(string(s.segment()))
                 }
                 return
         }
  
-       if s.mode&directives == 0 || r != 'l' {
-               s.skipComment(r)
+       if s.mode&directives == 0 || s.ch != 'l' {
+               s.stop()
+               s.skipComment()
                 return
         }
  
         // recognize line directive
         const prefix = "line "
         for _, m := range prefix {
-               if r != m {
-                       s.skipComment(r)
+               if s.ch != m {
+                       s.stop()
+                       s.skipComment()
                         return
                 }
-               r = s.getr()
+               s.nextch()
         }
  
         // directive text
-       s.startLit()
-       if s.skipComment(r) {
-               s.comment("/*" + prefix + string(s.stopLit()))
-       } else {
-               s.killLit() // not a complete comment - ignore
+       if s.skipComment() {
+               s.comment(string(s.segment()))
         }
  }
  
@@ -796,23 +801,23 @@ func (s *scanner) escape(quote rune) {
         var n int
         var base, max uint32
  
-       c := s.getr()
-       switch c {
-       case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
+       switch s.ch {
+       case quote, 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\':
+               s.nextch()
                 return
         case '0', '1', '2', '3', '4', '5', '6', '7':
                 n, base, max = 3, 8, 255
         case 'x':
-               c = s.getr()
+               s.nextch()
                 n, base, max = 2, 16, 255
         case 'u':
-               c = s.getr()
+               s.nextch()
                 n, base, max = 4, 16, unicode.MaxRune
         case 'U':
-               c = s.getr()
+               s.nextch()
                 n, base, max = 8, 16, unicode.MaxRune
         default:
-               if c < 0 {
+               if s.ch < 0 {
                         return // complain in caller about EOF
                 }
                 s.errorf("unknown escape")
@@ -821,30 +826,27 @@ func (s *scanner) escape(quote rune) {
  
         var x uint32
         for i := n; i > 0; i-- {
+               if s.ch < 0 {
+                       return // complain in caller about EOF
+               }
                 d := base
-               switch {
-               case isDecimal(c):
-                       d = uint32(c) - '0'
-               case 'a' <= lower(c) && lower(c) <= 'f':
-                       d = uint32(lower(c)) - ('a' - 10)
+               if isDecimal(s.ch) {
+                       d = uint32(s.ch) - '0'
+               } else if 'a' <= lower(s.ch) && lower(s.ch) <= 'f' {
+                       d = uint32(lower(s.ch)) - 'a' + 10
                 }
                 if d >= base {
-                       if c < 0 {
-                               return // complain in caller about EOF
-                       }
                         kind := "hex"
                         if base == 8 {
                                 kind = "octal"
                         }
-                       s.errorf("invalid character %q in %s escape", c, kind)
-                       s.ungetr()
+                       s.errorf("invalid character %q in %s escape", s.ch, kind)
                         return
                 }
                 // d < base
                 x = x*base + d
-               c = s.getr()
+               s.nextch()
         }
-       s.ungetr()
  
         if x > max && base == 8 {
                 s.errorf("octal escape value %d > 255", x)
diff --git a/src/cmd/compile/internal/syntax/scanner_test.go b/src/cmd/compile/internal/syntax/scanner_test.go

index f68334165043cd921af6a05fe4e80fe930965615..78e470c45cebf46eb7a47efe9a858a5197a32028 100644 (file)
--- a/src/cmd/compile/internal/syntax/scanner_test.go
+++ b/src/cmd/compile/internal/syntax/scanner_test.go
@@ -19,8 +19,8 @@ func errh(line, col uint, msg string) {
  
  // Don't bother with other tests if TestSmoke doesn't pass.
  func TestSmoke(t *testing.T) {
-       const src = "if (+foo\t+=..123/***/4.2_0e-0i'a'`raw`\"string\" ;//$"
-       tokens := []token{_If, _Lparen, _Operator, _Name, _AssignOp, _Dot, _Literal, _Literal, _Literal, _Literal, _Literal, _Semi, _EOF}
+       const src = "if (+foo\t+=..123/***/0.9_0e-0i'a'`raw`\"string\"..f;//$"
+       tokens := []token{_If, _Lparen, _Operator, _Name, _AssignOp, _Dot, _Literal, _Literal, _Literal, _Literal, _Literal, _Dot, _Dot, _Name, _Semi, _EOF}
  
         var got scanner
         got.init(strings.NewReader(src), errh, 0)
diff --git a/src/cmd/compile/internal/syntax/source.go b/src/cmd/compile/internal/syntax/source.go

index c671e3c11eaac9649a50c4fe8375f10deed779f0..01b592152bb7886885e2b870f513d81d7d20ead0 100644 (file)
--- a/src/cmd/compile/internal/syntax/source.go
+++ b/src/cmd/compile/internal/syntax/source.go
@@ -3,11 +3,10 @@
  // license that can be found in the LICENSE file.
  
  // This file implements source, a buffered rune reader
-// which is specialized for the needs of the Go scanner:
-// Contiguous sequences of runes (literals) are extracted
-// directly as []byte without the need to re-encode the
-// runes in UTF-8 (as would be necessary with bufio.Reader).
-//
+// specialized for scanning Go code: Reading
+// ASCII characters, maintaining current (line, col)
+// position information, and recording of the most
+// recently read source segment are highly optimized.
  // This file is self-contained (go tool compile source.go
  // compiles) and thus could be made into its own package.
  
@@ -18,202 +17,202 @@ import (
         "unicode/utf8"
  )
  
-// starting points for line and column numbers
-const linebase = 1
-const colbase = 1
-
-// max. number of bytes to unread
-const maxunread = 10
-
-// buf [...read...|...|...unread...|s|...free...]
-//         ^      ^   ^            ^
-//         |      |   |            |
-//        suf     r0  r            w
+// The source buffer is accessed using three indices b (begin),
+// r (read), and e (end):
+//
+// - If b >= 0, it points to the beginning of a segment of most
+//   recently read characters (typically a Go literal).
+//
+// - r points to the byte immediately following the most recently
+//   read character ch, which starts at r-chw.
+//
+// - e points to the byte immediately following the last byte that
+//   was read into the buffer.
+//
+// The buffer content is terminated at buf[e] with the sentinel
+// character utf8.RuneSelf. This makes it possible to test for
+// the common case of ASCII characters with a single 'if' (see
+// nextch method).
+//
+//                +------ content in use -------+
+//                v                             v
+// buf [...read...|...segment...|ch|...unread...|s|...free...]
+//                ^             ^  ^            ^
+//                |             |  |            |
+//                b         r-chw  r            e
+//
+// Invariant: -1 <= b < r <= e < len(buf) && buf[e] == sentinel
  
  type source struct {
-       src  io.Reader
-       errh func(line, pos uint, msg string)
-
-       // source buffer
-       buf         [4 << 10]byte
-       r0, r, w    int   // previous/current read and write buf positions, excluding sentinel
-       line0, line uint  // previous/current line
-       col0, col   uint  // previous/current column (byte offsets from line start)
-       ioerr       error // pending io error
-
-       // literal buffer
-       lit []byte // literal prefix
-       suf int    // literal suffix; suf >= 0 means we are scanning a literal
+       in   io.Reader
+       errh func(line, col uint, msg string)
+
+       buf       []byte // source buffer
+       ioerr     error  // pending I/O error, or nil
+       b, r, e   int    // buffer indices (see comment above)
+       line, col uint   // source position of ch (0-based)
+       ch        rune   // most recently read character
+       chw       int    // width of ch
  }
  
-// init initializes source to read from src and to report errors via errh.
-// errh must not be nil.
-func (s *source) init(src io.Reader, errh func(line, pos uint, msg string)) {
-       s.src = src
+const sentinel = utf8.RuneSelf
+
+func (s *source) init(in io.Reader, errh func(line, col uint, msg string)) {
+       s.in = in
         s.errh = errh
  
-       s.buf[0] = utf8.RuneSelf // terminate with sentinel
-       s.r0, s.r, s.w = 0, 0, 0
-       s.line0, s.line = 0, linebase
-       s.col0, s.col = 0, colbase
+       if s.buf == nil {
+               s.buf = make([]byte, nextSize(0))
+       }
+       s.buf[0] = sentinel
         s.ioerr = nil
-
-       s.lit = s.lit[:0]
-       s.suf = -1
+       s.b, s.r, s.e = -1, 0, 0
+       s.line, s.col = 0, 0
+       s.ch = ' '
+       s.chw = 0
  }
  
-// ungetr sets the reading position to a previous reading
-// position, usually the one of the most recently read
-// rune, but possibly earlier (see unread below).
-func (s *source) ungetr() {
-       s.r, s.line, s.col = s.r0, s.line0, s.col0
-}
+// starting points for line and column numbers
+const linebase = 1
+const colbase = 1
  
-// unread moves the previous reading position to a position
-// that is n bytes earlier in the source. The next ungetr
-// call will set the reading position to that moved position.
-// The "unread" runes must be single byte and not contain any
-// newlines; and 0 <= n <= maxunread must hold.
-func (s *source) unread(n int) {
-       s.r0 -= n
-       s.col0 -= uint(n)
+// pos returns the (line, col) source position of s.ch.
+func (s *source) pos() (line, col uint) {
+       return linebase + s.line, colbase + s.col
  }
  
+// error reports the error msg at source position s.pos().
  func (s *source) error(msg string) {
-       s.errh(s.line0, s.col0, msg)
+       line, col := s.pos()
+       s.errh(line, col, msg)
  }
  
-// getr reads and returns the next rune.
-//
-// If a read or source encoding error occurs, getr
-// calls the error handler installed with init.
-// The handler must exist.
-//
-// The (line, col) position passed to the error handler
-// is always at the current source reading position.
-func (s *source) getr() rune {
-redo:
-       s.r0, s.line0, s.col0 = s.r, s.line, s.col
-
-       // We could avoid at least one test that is always taken in the
-       // for loop below by duplicating the common case code (ASCII)
-       // here since we always have at least the sentinel (utf8.RuneSelf)
-       // in the buffer. Measure and optimize if necessary.
+// start starts a new active source segment (including s.ch).
+// As long as stop has not been called, the active segment's
+// bytes (excluding s.ch) may be retrieved by calling segment.
+func (s *source) start()          { s.b = s.r - s.chw }
+func (s *source) stop()           { s.b = -1 }
+func (s *source) segment() []byte { return s.buf[s.b : s.r-s.chw] }
+
+// rewind rewinds the scanner's read position and character s.ch
+// to the start of the currently active segment, which must not
+// contain any newlines (otherwise position information will be
+// incorrect). Currently, rewind is only needed for handling the
+// source sequence ".."; it must not be called outside an active
+// segment.
+func (s *source) rewind() {
+       // ok to verify precondition - rewind is rarely called
+       if s.b < 0 {
+               panic("no active segment")
+       }
+       s.col -= uint(s.r - s.b)
+       s.r = s.b
+       s.nextch()
+}
  
-       // make sure we have at least one rune in buffer, or we are at EOF
-       for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.ioerr == nil && s.w-s.r < len(s.buf) {
-               s.fill() // s.w-s.r < len(s.buf) => buffer is not full
+func (s *source) nextch() {
+redo:
+       s.col += uint(s.chw)
+       if s.ch == '\n' {
+               s.line++
+               s.col = 0
         }
  
-       // common case: ASCII and enough bytes
-       // (invariant: s.buf[s.w] == utf8.RuneSelf)
-       if b := s.buf[s.r]; b < utf8.RuneSelf {
+       // fast common case: at least one ASCII character
+       if s.ch = rune(s.buf[s.r]); s.ch < sentinel {
                 s.r++
-               // TODO(gri) Optimization: Instead of adjusting s.col for each character,
-               // remember the line offset instead and then compute the offset as needed
-               // (which is less often).
-               s.col++
-               if b == 0 {
+               s.chw = 1
+               if s.ch == 0 {
                         s.error("invalid NUL character")
                         goto redo
                 }
-               if b == '\n' {
-                       s.line++
-                       s.col = colbase
-               }
-               return rune(b)
+               return
+       }
+
+       // slower general case: add more bytes to buffer if we don't have a full rune
+       for s.e-s.r < utf8.UTFMax && !utf8.FullRune(s.buf[s.r:s.e]) && s.ioerr == nil {
+               s.fill()
         }
  
         // EOF
-       if s.r == s.w {
+       if s.r == s.e {
                 if s.ioerr != io.EOF {
                         // ensure we never start with a '/' (e.g., rooted path) in the error message
                         s.error("I/O error: " + s.ioerr.Error())
+                       s.ioerr = nil
                 }
-               return -1
+               s.ch = -1
+               s.chw = 0
+               return
         }
  
-       // uncommon case: not ASCII
-       r, w := utf8.DecodeRune(s.buf[s.r:s.w])
-       s.r += w
-       s.col += uint(w)
+       s.ch, s.chw = utf8.DecodeRune(s.buf[s.r:s.e])
+       s.r += s.chw
  
-       if r == utf8.RuneError && w == 1 {
+       if s.ch == utf8.RuneError && s.chw == 1 {
                 s.error("invalid UTF-8 encoding")
                 goto redo
         }
  
         // BOM's are only allowed as the first character in a file
         const BOM = 0xfeff
-       if r == BOM {
-               if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to maxunread)
+       if s.ch == BOM {
+               if s.line > 0 || s.col > 0 {
                         s.error("invalid BOM in the middle of the file")
                 }
                 goto redo
         }
-
-       return r
  }
  
+// fill reads more source bytes into s.buf.
+// It returns with at least one more byte in the buffer, or with s.ioerr != nil.
  func (s *source) fill() {
-       // Slide unread bytes to beginning but preserve last read char
-       // (for one ungetr call) plus maxunread extra bytes (for one
-       // unread call).
-       if s.r0 > maxunread {
-               n := s.r0 - maxunread // number of bytes to slide down
-               // save literal prefix, if any
-               // (make sure we keep maxunread bytes and the last
-               // read char in the buffer)
-               if s.suf >= 0 {
-                       // we have a literal
-                       if s.suf < n {
-                               // save literal prefix
-                               s.lit = append(s.lit, s.buf[s.suf:n]...)
-                               s.suf = 0
-                       } else {
-                               s.suf -= n
-                       }
-               }
-               copy(s.buf[:], s.buf[n:s.w])
-               s.r0 = maxunread // eqv: s.r0 -= n
-               s.r -= n
-               s.w -= n
+       // determine content to preserve
+       b := s.r
+       if s.b >= 0 {
+               b = s.b
+               s.b = 0 // after buffer has grown or content has been moved down
         }
+       content := s.buf[b:s.e]
+
+       // grow buffer or move content down
+       if len(content)*2 > len(s.buf) {
+               s.buf = make([]byte, nextSize(len(s.buf)))
+               copy(s.buf, content)
+       } else if b > 0 {
+               copy(s.buf, content)
+       }
+       s.r -= b
+       s.e -= b
  
         // read more data: try a limited number of times
-       for i := 100; i > 0; i-- {
-               n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel
+       for i := 0; i < 10; i++ {
+               var n int
+               n, s.ioerr = s.in.Read(s.buf[s.e : len(s.buf)-1]) // -1 to leave space for sentinel
                 if n < 0 {
                         panic("negative read") // incorrect underlying io.Reader implementation
                 }
-               s.w += n
-               if n > 0 || err != nil {
-                       s.buf[s.w] = utf8.RuneSelf // sentinel
-                       if err != nil {
-                               s.ioerr = err
-                       }
+               if n > 0 || s.ioerr != nil {
+                       s.e += n
+                       s.buf[s.e] = sentinel
                         return
                 }
+               // n == 0
         }
  
-       s.buf[s.w] = utf8.RuneSelf // sentinel
+       s.buf[s.e] = sentinel
         s.ioerr = io.ErrNoProgress
  }
  
-func (s *source) startLit() {
-       s.suf = s.r0
-       s.lit = s.lit[:0] // reuse lit
-}
-
-func (s *source) stopLit() []byte {
-       lit := s.buf[s.suf:s.r]
-       if len(s.lit) > 0 {
-               lit = append(s.lit, lit...)
+// nextSize returns the next bigger size for a buffer of a given size.
+func nextSize(size int) int {
+       const min = 4 << 10 // 4K: minimum buffer size
+       const max = 1 << 20 // 1M: maximum buffer size which is still doubled
+       if size < min {
+               return min
         }
-       s.killLit()
-       return lit
-}
-
-func (s *source) killLit() {
-       s.suf = -1 // no pending literal
+       if size <= max {
+               return size << 1
+       }
+       return size + max
  }
author	Robert Griesemer <gri@golang.org>
	Thu, 27 Feb 2020 05:31:00 +0000 (21:31 -0800)
committer	Robert Griesemer <gri@golang.org>
	Thu, 5 Mar 2020 19:25:46 +0000 (19:25 +0000)
src/cmd/compile/internal/syntax/parser.go		patch \| blob \| history
src/cmd/compile/internal/syntax/scanner.go		patch \| blob \| history
src/cmd/compile/internal/syntax/scanner_test.go		patch \| blob \| history
src/cmd/compile/internal/syntax/source.go		patch \| blob \| history