// Go source. After initialization, consecutive calls of
// next advance the scanner one token at a time.
//
-// This file, source.go, and tokens.go are self-contained
-// (go tool compile scanner.go source.go tokens.go compiles)
-// and thus could be made into its own package.
+// This file, source.go, tokens.go, and token_string.go are self-contained
+// (`go tool compile scanner.go source.go tokens.go token_string.go` compiles)
+// and thus could be made into their own package.
package syntax
redo:
// skip white space
- c := s.getr()
- for c == ' ' || c == '\t' || c == '\n' && !nlsemi || c == '\r' {
- c = s.getr()
+ s.stop()
+ for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !nlsemi || s.ch == '\r' {
+ s.nextch()
}
// token start
- s.line, s.col = s.source.line0, s.source.col0
-
- if isLetter(c) || c >= utf8.RuneSelf && s.isIdentRune(c, true) {
+ s.line, s.col = s.pos()
+ s.start()
+ if isLetter(s.ch) || s.ch >= utf8.RuneSelf && s.atIdentChar(true) {
+ s.nextch()
s.ident()
return
}
- switch c {
+ switch s.ch {
case -1:
if nlsemi {
s.lit = "EOF"
s.tok = _EOF
case '\n':
+ s.nextch()
s.lit = "newline"
s.tok = _Semi
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
- s.number(c)
+ s.number(false)
case '"':
s.stdString()
s.rune()
case '(':
+ s.nextch()
s.tok = _Lparen
case '[':
+ s.nextch()
s.tok = _Lbrack
case '{':
+ s.nextch()
s.tok = _Lbrace
case ',':
+ s.nextch()
s.tok = _Comma
case ';':
+ s.nextch()
s.lit = "semicolon"
s.tok = _Semi
case ')':
+ s.nextch()
s.nlsemi = true
s.tok = _Rparen
case ']':
+ s.nextch()
s.nlsemi = true
s.tok = _Rbrack
case '}':
+ s.nextch()
s.nlsemi = true
s.tok = _Rbrace
case ':':
- if s.getr() == '=' {
+ s.nextch()
+ if s.ch == '=' {
+ s.nextch()
s.tok = _Define
break
}
- s.ungetr()
s.tok = _Colon
case '.':
- c = s.getr()
- if isDecimal(c) {
- s.ungetr()
- s.unread(1) // correct position of '.' (needed by startLit in number)
- s.number('.')
+ s.nextch()
+ if isDecimal(s.ch) {
+ s.number(true)
break
}
- if c == '.' {
- c = s.getr()
- if c == '.' {
+ if s.ch == '.' {
+ s.nextch()
+ if s.ch == '.' {
+ s.nextch()
s.tok = _DotDotDot
break
}
- s.unread(1)
+ s.rewind() // now s.ch holds 1st '.'
+ s.nextch() // consume 1st '.' again
}
- s.ungetr()
s.tok = _Dot
case '+':
+ s.nextch()
s.op, s.prec = Add, precAdd
- c = s.getr()
- if c != '+' {
+ if s.ch != '+' {
goto assignop
}
+ s.nextch()
s.nlsemi = true
s.tok = _IncOp
case '-':
+ s.nextch()
s.op, s.prec = Sub, precAdd
- c = s.getr()
- if c != '-' {
+ if s.ch != '-' {
goto assignop
}
+ s.nextch()
s.nlsemi = true
s.tok = _IncOp
case '*':
+ s.nextch()
s.op, s.prec = Mul, precMul
// don't goto assignop - want _Star token
- if s.getr() == '=' {
+ if s.ch == '=' {
+ s.nextch()
s.tok = _AssignOp
break
}
- s.ungetr()
s.tok = _Star
case '/':
- c = s.getr()
- if c == '/' {
+ s.nextch()
+ if s.ch == '/' {
+ s.nextch()
s.lineComment()
goto redo
}
- if c == '*' {
+ if s.ch == '*' {
+ s.nextch()
s.fullComment()
- if s.source.line > s.line && nlsemi {
+ if line, _ := s.pos(); line > s.line && nlsemi {
// A multi-line comment acts like a newline;
// it translates to a ';' if nlsemi is set.
s.lit = "newline"
goto assignop
case '%':
+ s.nextch()
s.op, s.prec = Rem, precMul
- c = s.getr()
goto assignop
case '&':
- c = s.getr()
- if c == '&' {
+ s.nextch()
+ if s.ch == '&' {
+ s.nextch()
s.op, s.prec = AndAnd, precAndAnd
s.tok = _Operator
break
}
s.op, s.prec = And, precMul
- if c == '^' {
+ if s.ch == '^' {
+ s.nextch()
s.op = AndNot
- c = s.getr()
}
goto assignop
case '|':
- c = s.getr()
- if c == '|' {
+ s.nextch()
+ if s.ch == '|' {
+ s.nextch()
s.op, s.prec = OrOr, precOrOr
s.tok = _Operator
break
goto assignop
case '^':
+ s.nextch()
s.op, s.prec = Xor, precAdd
- c = s.getr()
goto assignop
case '<':
- c = s.getr()
- if c == '=' {
+ s.nextch()
+ if s.ch == '=' {
+ s.nextch()
s.op, s.prec = Leq, precCmp
s.tok = _Operator
break
}
- if c == '<' {
+ if s.ch == '<' {
+ s.nextch()
s.op, s.prec = Shl, precMul
- c = s.getr()
goto assignop
}
- if c == '-' {
+ if s.ch == '-' {
+ s.nextch()
s.tok = _Arrow
break
}
- s.ungetr()
s.op, s.prec = Lss, precCmp
s.tok = _Operator
case '>':
- c = s.getr()
- if c == '=' {
+ s.nextch()
+ if s.ch == '=' {
+ s.nextch()
s.op, s.prec = Geq, precCmp
s.tok = _Operator
break
}
- if c == '>' {
+ if s.ch == '>' {
+ s.nextch()
s.op, s.prec = Shr, precMul
- c = s.getr()
goto assignop
}
- s.ungetr()
s.op, s.prec = Gtr, precCmp
s.tok = _Operator
case '=':
- if s.getr() == '=' {
+ s.nextch()
+ if s.ch == '=' {
+ s.nextch()
s.op, s.prec = Eql, precCmp
s.tok = _Operator
break
}
- s.ungetr()
s.tok = _Assign
case '!':
- if s.getr() == '=' {
+ s.nextch()
+ if s.ch == '=' {
+ s.nextch()
s.op, s.prec = Neq, precCmp
s.tok = _Operator
break
}
- s.ungetr()
s.op, s.prec = Not, 0
s.tok = _Operator
default:
- s.tok = 0
- s.errorf("invalid character %#U", c)
+ s.errorf("invalid character %#U", s.ch)
+ s.nextch()
goto redo
}
return
assignop:
- if c == '=' {
+ if s.ch == '=' {
+ s.nextch()
s.tok = _AssignOp
return
}
- s.ungetr()
s.tok = _Operator
}
-func isLetter(c rune) bool {
- return 'a' <= lower(c) && lower(c) <= 'z' || c == '_'
-}
-
func (s *scanner) ident() {
- s.startLit()
-
// accelerate common case (7bit ASCII)
- c := s.getr()
- for isLetter(c) || isDecimal(c) {
- c = s.getr()
+ for isLetter(s.ch) || isDecimal(s.ch) {
+ s.nextch()
}
// general case
- if c >= utf8.RuneSelf {
- for s.isIdentRune(c, false) {
- c = s.getr()
+ if s.ch >= utf8.RuneSelf {
+ for s.atIdentChar(false) {
+ s.nextch()
}
}
- s.ungetr()
-
- lit := s.stopLit()
// possibly a keyword
+ lit := s.segment()
if len(lit) >= 2 {
if tok := keywordMap[hash(lit)]; tok != 0 && tokStrFast(tok) == string(lit) {
s.nlsemi = contains(1<<_Break|1<<_Continue|1<<_Fallthrough|1<<_Return, tok)
return _token_name[_token_index[tok-1]:_token_index[tok]]
}
-func (s *scanner) isIdentRune(c rune, first bool) bool {
+func (s *scanner) atIdentChar(first bool) bool {
switch {
- case unicode.IsLetter(c) || c == '_':
+ case unicode.IsLetter(s.ch) || s.ch == '_':
// ok
- case unicode.IsDigit(c):
+ case unicode.IsDigit(s.ch):
if first {
- s.errorf("identifier cannot begin with digit %#U", c)
+ s.errorf("identifier cannot begin with digit %#U", s.ch)
}
- case c >= utf8.RuneSelf:
- s.errorf("invalid character %#U in identifier", c)
+ case s.ch >= utf8.RuneSelf:
+ s.errorf("invalid character %#U in identifier", s.ch)
default:
return false
}
}
}
-func lower(c rune) rune { return ('a' - 'A') | c } // returns lower-case c iff c is ASCII letter
-func isDecimal(c rune) bool { return '0' <= c && c <= '9' }
-func isHex(c rune) bool { return '0' <= c && c <= '9' || 'a' <= lower(c) && lower(c) <= 'f' }
+func lower(ch rune) rune { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
+func isLetter(ch rune) bool { return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' }
+func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
+func isHex(ch rune) bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
-// digits accepts the sequence { digit | '_' } starting with c0.
+// digits accepts the sequence { digit | '_' }.
// If base <= 10, digits accepts any decimal digit but records
// the index (relative to the literal start) of a digit >= base
// in *invalid, if *invalid < 0.
-// digits returns the first rune that is not part of the sequence
-// anymore, and a bitset describing whether the sequence contained
+// digits returns a bitset describing whether the sequence contained
// digits (bit 0 is set), or separators '_' (bit 1 is set).
-func (s *scanner) digits(c0 rune, base int, invalid *int) (c rune, digsep int) {
- c = c0
+func (s *scanner) digits(base int, invalid *int) (digsep int) {
if base <= 10 {
max := rune('0' + base)
- for isDecimal(c) || c == '_' {
+ for isDecimal(s.ch) || s.ch == '_' {
ds := 1
- if c == '_' {
+ if s.ch == '_' {
ds = 2
- } else if c >= max && *invalid < 0 {
- *invalid = int(s.col0 - s.col) // record invalid rune index
+ } else if s.ch >= max && *invalid < 0 {
+ _, col := s.pos()
+ *invalid = int(col - s.col) // record invalid rune index
}
digsep |= ds
- c = s.getr()
+ s.nextch()
}
} else {
- for isHex(c) || c == '_' {
+ for isHex(s.ch) || s.ch == '_' {
ds := 1
- if c == '_' {
+ if s.ch == '_' {
ds = 2
}
digsep |= ds
- c = s.getr()
+ s.nextch()
}
}
return
}
-func (s *scanner) number(c rune) {
- s.startLit()
+func (s *scanner) number(seenPoint bool) {
s.bad = false
base := 10 // number base
invalid := -1 // index of invalid digit in literal, or < 0
// integer part
- var ds int
- if c != '.' {
+ if !seenPoint {
s.kind = IntLit
- if c == '0' {
- c = s.getr()
- switch lower(c) {
+ if s.ch == '0' {
+ s.nextch()
+ switch lower(s.ch) {
case 'x':
- c = s.getr()
+ s.nextch()
base, prefix = 16, 'x'
case 'o':
- c = s.getr()
+ s.nextch()
base, prefix = 8, 'o'
case 'b':
- c = s.getr()
+ s.nextch()
base, prefix = 2, 'b'
default:
base, prefix = 8, '0'
digsep = 1 // leading 0
}
}
- c, ds = s.digits(c, base, &invalid)
- digsep |= ds
+ digsep |= s.digits(base, &invalid)
+ if s.ch == '.' {
+ if prefix == 'o' || prefix == 'b' {
+ s.errorf("invalid radix point in %s", litname(prefix))
+ }
+ s.nextch()
+ seenPoint = true
+ }
}
// fractional part
- if c == '.' {
+ if seenPoint {
s.kind = FloatLit
- if prefix == 'o' || prefix == 'b' {
- s.errorf("invalid radix point in %s", litname(prefix))
- }
- c, ds = s.digits(s.getr(), base, &invalid)
- digsep |= ds
+ digsep |= s.digits(base, &invalid)
}
if digsep&1 == 0 && !s.bad {
}
// exponent
- if e := lower(c); e == 'e' || e == 'p' {
+ if e := lower(s.ch); e == 'e' || e == 'p' {
if !s.bad {
switch {
case e == 'e' && prefix != 0 && prefix != '0':
- s.errorf("%q exponent requires decimal mantissa", c)
+ s.errorf("%q exponent requires decimal mantissa", s.ch)
case e == 'p' && prefix != 'x':
- s.errorf("%q exponent requires hexadecimal mantissa", c)
+ s.errorf("%q exponent requires hexadecimal mantissa", s.ch)
}
}
- c = s.getr()
+ s.nextch()
s.kind = FloatLit
- if c == '+' || c == '-' {
- c = s.getr()
+ if s.ch == '+' || s.ch == '-' {
+ s.nextch()
}
- c, ds = s.digits(c, 10, nil)
- digsep |= ds
- if ds&1 == 0 && !s.bad {
+ digsep = s.digits(10, nil) | digsep&2 // don't lose sep bit
+ if digsep&1 == 0 && !s.bad {
s.errorf("exponent has no digits")
}
} else if prefix == 'x' && s.kind == FloatLit && !s.bad {
}
// suffix 'i'
- if c == 'i' {
+ if s.ch == 'i' {
s.kind = ImagLit
- c = s.getr()
+ s.nextch()
}
- s.ungetr()
s.nlsemi = true
- s.lit = string(s.stopLit())
+ s.lit = string(s.segment())
s.tok = _Literal
if s.kind == IntLit && invalid >= 0 && !s.bad {
}
func (s *scanner) rune() {
- s.startLit()
s.bad = false
+ s.nextch()
n := 0
for ; ; n++ {
- r := s.getr()
- if r == '\'' {
+ if s.ch == '\'' {
+ if !s.bad {
+ if n == 0 {
+ s.errorf("empty rune literal or unescaped '")
+ } else if n != 1 {
+ s.errorAtf(0, "more than one character in rune literal")
+ }
+ }
+ s.nextch()
break
}
- if r == '\\' {
+ if s.ch == '\\' {
+ s.nextch()
s.escape('\'')
continue
}
- if r == '\n' {
- s.ungetr() // assume newline is not part of literal
+ if s.ch == '\n' {
if !s.bad {
s.errorf("newline in rune literal")
}
break
}
- if r < 0 {
+ if s.ch < 0 {
if !s.bad {
s.errorAtf(0, "rune literal not terminated")
}
break
}
- }
-
- if !s.bad {
- if n == 0 {
- s.errorf("empty rune literal or unescaped '")
- } else if n != 1 {
- s.errorAtf(0, "more than one character in rune literal")
- }
+ s.nextch()
}
s.nlsemi = true
- s.lit = string(s.stopLit())
+ s.lit = string(s.segment())
s.kind = RuneLit
s.tok = _Literal
}
func (s *scanner) stdString() {
- s.startLit()
s.bad = false
+ s.nextch()
for {
- r := s.getr()
- if r == '"' {
+ if s.ch == '"' {
+ s.nextch()
break
}
- if r == '\\' {
+ if s.ch == '\\' {
+ s.nextch()
s.escape('"')
continue
}
- if r == '\n' {
- s.ungetr() // assume newline is not part of literal
+ if s.ch == '\n' {
s.errorf("newline in string")
break
}
- if r < 0 {
+ if s.ch < 0 {
s.errorAtf(0, "string not terminated")
break
}
+ s.nextch()
}
s.nlsemi = true
- s.lit = string(s.stopLit())
+ s.lit = string(s.segment())
s.kind = StringLit
s.tok = _Literal
}
func (s *scanner) rawString() {
- s.startLit()
s.bad = false
+ s.nextch()
for {
- r := s.getr()
- if r == '`' {
+ if s.ch == '`' {
+ s.nextch()
break
}
- if r < 0 {
+ if s.ch < 0 {
s.errorAtf(0, "string not terminated")
break
}
+ s.nextch()
}
// We leave CRs in the string since they are part of the
// literal (even though they are not part of the literal
// value).
s.nlsemi = true
- s.lit = string(s.stopLit())
+ s.lit = string(s.segment())
s.kind = StringLit
s.tok = _Literal
}
func (s *scanner) comment(text string) {
- s.errh(s.line, s.col, text)
+ s.errorAtf(0, text)
}
-func (s *scanner) skipLine(r rune) {
- for r >= 0 {
- if r == '\n' {
- s.ungetr() // don't consume '\n' - needed for nlsemi logic
- break
- }
- r = s.getr()
+func (s *scanner) skipLine() {
+ // don't consume '\n' - needed for nlsemi logic
+ for s.ch >= 0 && s.ch != '\n' {
+ s.nextch()
}
}
func (s *scanner) lineComment() {
- r := s.getr()
+ // opening has already been consumed
if s.mode&comments != 0 {
- s.startLit()
- s.skipLine(r)
- s.comment("//" + string(s.stopLit()))
+ s.skipLine()
+ s.comment(string(s.segment()))
return
}
// directives must start at the beginning of the line (s.col == colbase)
- if s.mode&directives == 0 || s.col != colbase || (r != 'g' && r != 'l') {
- s.skipLine(r)
+ if s.mode&directives == 0 || s.col != colbase || (s.ch != 'g' && s.ch != 'l') {
+ s.stop()
+ s.skipLine()
return
}
// recognize go: or line directives
prefix := "go:"
- if r == 'l' {
+ if s.ch == 'l' {
prefix = "line "
}
for _, m := range prefix {
- if r != m {
- s.skipLine(r)
+ if s.ch != m {
+ s.stop()
+ s.skipLine()
return
}
- r = s.getr()
+ s.nextch()
}
// directive text
- s.startLit()
- s.skipLine(r)
- s.comment("//" + prefix + string(s.stopLit()))
+ s.skipLine()
+ s.comment(string(s.segment()))
}
-func (s *scanner) skipComment(r rune) bool {
- for r >= 0 {
- for r == '*' {
- r = s.getr()
- if r == '/' {
+func (s *scanner) skipComment() bool {
+ for s.ch >= 0 {
+ for s.ch == '*' {
+ s.nextch()
+ if s.ch == '/' {
+ s.nextch()
return true
}
}
- r = s.getr()
+ s.nextch()
}
s.errorAtf(0, "comment not terminated")
return false
}
func (s *scanner) fullComment() {
- r := s.getr()
+ /* opening has already been consumed */
if s.mode&comments != 0 {
- s.startLit()
- if s.skipComment(r) {
- s.comment("/*" + string(s.stopLit()))
- } else {
- s.killLit() // not a complete comment - ignore
+ if s.skipComment() {
+ s.comment(string(s.segment()))
}
return
}
- if s.mode&directives == 0 || r != 'l' {
- s.skipComment(r)
+ if s.mode&directives == 0 || s.ch != 'l' {
+ s.stop()
+ s.skipComment()
return
}
// recognize line directive
const prefix = "line "
for _, m := range prefix {
- if r != m {
- s.skipComment(r)
+ if s.ch != m {
+ s.stop()
+ s.skipComment()
return
}
- r = s.getr()
+ s.nextch()
}
// directive text
- s.startLit()
- if s.skipComment(r) {
- s.comment("/*" + prefix + string(s.stopLit()))
- } else {
- s.killLit() // not a complete comment - ignore
+ if s.skipComment() {
+ s.comment(string(s.segment()))
}
}
var n int
var base, max uint32
- c := s.getr()
- switch c {
- case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
+ switch s.ch {
+ case quote, 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\':
+ s.nextch()
return
case '0', '1', '2', '3', '4', '5', '6', '7':
n, base, max = 3, 8, 255
case 'x':
- c = s.getr()
+ s.nextch()
n, base, max = 2, 16, 255
case 'u':
- c = s.getr()
+ s.nextch()
n, base, max = 4, 16, unicode.MaxRune
case 'U':
- c = s.getr()
+ s.nextch()
n, base, max = 8, 16, unicode.MaxRune
default:
- if c < 0 {
+ if s.ch < 0 {
return // complain in caller about EOF
}
s.errorf("unknown escape")
var x uint32
for i := n; i > 0; i-- {
+ if s.ch < 0 {
+ return // complain in caller about EOF
+ }
d := base
- switch {
- case isDecimal(c):
- d = uint32(c) - '0'
- case 'a' <= lower(c) && lower(c) <= 'f':
- d = uint32(lower(c)) - ('a' - 10)
+ if isDecimal(s.ch) {
+ d = uint32(s.ch) - '0'
+ } else if 'a' <= lower(s.ch) && lower(s.ch) <= 'f' {
+ d = uint32(lower(s.ch)) - 'a' + 10
}
if d >= base {
- if c < 0 {
- return // complain in caller about EOF
- }
kind := "hex"
if base == 8 {
kind = "octal"
}
- s.errorf("invalid character %q in %s escape", c, kind)
- s.ungetr()
+ s.errorf("invalid character %q in %s escape", s.ch, kind)
return
}
// d < base
x = x*base + d
- c = s.getr()
+ s.nextch()
}
- s.ungetr()
if x > max && base == 8 {
s.errorf("octal escape value %d > 255", x)
// license that can be found in the LICENSE file.
// This file implements source, a buffered rune reader
-// which is specialized for the needs of the Go scanner:
-// Contiguous sequences of runes (literals) are extracted
-// directly as []byte without the need to re-encode the
-// runes in UTF-8 (as would be necessary with bufio.Reader).
-//
+// specialized for scanning Go code: Reading
+// ASCII characters, maintaining current (line, col)
+// position information, and recording of the most
+// recently read source segment are highly optimized.
// This file is self-contained (go tool compile source.go
// compiles) and thus could be made into its own package.
"unicode/utf8"
)
-// starting points for line and column numbers
-const linebase = 1
-const colbase = 1
-
-// max. number of bytes to unread
-const maxunread = 10
-
-// buf [...read...|...|...unread...|s|...free...]
-// ^ ^ ^ ^
-// | | | |
-// suf r0 r w
+// The source buffer is accessed using three indices b (begin),
+// r (read), and e (end):
+//
+// - If b >= 0, it points to the beginning of a segment of most
+// recently read characters (typically a Go literal).
+//
+// - r points to the byte immediately following the most recently
+// read character ch, which starts at r-chw.
+//
+// - e points to the byte immediately following the last byte that
+// was read into the buffer.
+//
+// The buffer content is terminated at buf[e] with the sentinel
+// character utf8.RuneSelf. This makes it possible to test for
+// the common case of ASCII characters with a single 'if' (see
+// nextch method).
+//
+// +------ content in use -------+
+// v v
+// buf [...read...|...segment...|ch|...unread...|s|...free...]
+// ^ ^ ^ ^
+// | | | |
+// b r-chw r e
+//
+// Invariant: -1 <= b < r <= e < len(buf) && buf[e] == sentinel
type source struct {
- src io.Reader
- errh func(line, pos uint, msg string)
-
- // source buffer
- buf [4 << 10]byte
- r0, r, w int // previous/current read and write buf positions, excluding sentinel
- line0, line uint // previous/current line
- col0, col uint // previous/current column (byte offsets from line start)
- ioerr error // pending io error
-
- // literal buffer
- lit []byte // literal prefix
- suf int // literal suffix; suf >= 0 means we are scanning a literal
+ in io.Reader
+ errh func(line, col uint, msg string)
+
+ buf []byte // source buffer
+ ioerr error // pending I/O error, or nil
+ b, r, e int // buffer indices (see comment above)
+ line, col uint // source position of ch (0-based)
+ ch rune // most recently read character
+ chw int // width of ch
}
-// init initializes source to read from src and to report errors via errh.
-// errh must not be nil.
-func (s *source) init(src io.Reader, errh func(line, pos uint, msg string)) {
- s.src = src
+const sentinel = utf8.RuneSelf
+
+func (s *source) init(in io.Reader, errh func(line, col uint, msg string)) {
+ s.in = in
s.errh = errh
- s.buf[0] = utf8.RuneSelf // terminate with sentinel
- s.r0, s.r, s.w = 0, 0, 0
- s.line0, s.line = 0, linebase
- s.col0, s.col = 0, colbase
+ if s.buf == nil {
+ s.buf = make([]byte, nextSize(0))
+ }
+ s.buf[0] = sentinel
s.ioerr = nil
-
- s.lit = s.lit[:0]
- s.suf = -1
+ s.b, s.r, s.e = -1, 0, 0
+ s.line, s.col = 0, 0
+ s.ch = ' '
+ s.chw = 0
}
-// ungetr sets the reading position to a previous reading
-// position, usually the one of the most recently read
-// rune, but possibly earlier (see unread below).
-func (s *source) ungetr() {
- s.r, s.line, s.col = s.r0, s.line0, s.col0
-}
+// starting points for line and column numbers
+const linebase = 1
+const colbase = 1
-// unread moves the previous reading position to a position
-// that is n bytes earlier in the source. The next ungetr
-// call will set the reading position to that moved position.
-// The "unread" runes must be single byte and not contain any
-// newlines; and 0 <= n <= maxunread must hold.
-func (s *source) unread(n int) {
- s.r0 -= n
- s.col0 -= uint(n)
+// pos returns the (line, col) source position of s.ch.
+func (s *source) pos() (line, col uint) {
+ return linebase + s.line, colbase + s.col
}
+// error reports the error msg at source position s.pos().
func (s *source) error(msg string) {
- s.errh(s.line0, s.col0, msg)
+ line, col := s.pos()
+ s.errh(line, col, msg)
}
-// getr reads and returns the next rune.
-//
-// If a read or source encoding error occurs, getr
-// calls the error handler installed with init.
-// The handler must exist.
-//
-// The (line, col) position passed to the error handler
-// is always at the current source reading position.
-func (s *source) getr() rune {
-redo:
- s.r0, s.line0, s.col0 = s.r, s.line, s.col
-
- // We could avoid at least one test that is always taken in the
- // for loop below by duplicating the common case code (ASCII)
- // here since we always have at least the sentinel (utf8.RuneSelf)
- // in the buffer. Measure and optimize if necessary.
+// start starts a new active source segment (including s.ch).
+// As long as stop has not been called, the active segment's
+// bytes (excluding s.ch) may be retrieved by calling segment.
+func (s *source) start() { s.b = s.r - s.chw }
+func (s *source) stop() { s.b = -1 }
+func (s *source) segment() []byte { return s.buf[s.b : s.r-s.chw] }
+
+// rewind rewinds the scanner's read position and character s.ch
+// to the start of the currently active segment, which must not
+// contain any newlines (otherwise position information will be
+// incorrect). Currently, rewind is only needed for handling the
+// source sequence ".."; it must not be called outside an active
+// segment.
+func (s *source) rewind() {
+ // ok to verify precondition - rewind is rarely called
+ if s.b < 0 {
+ panic("no active segment")
+ }
+ s.col -= uint(s.r - s.b)
+ s.r = s.b
+ s.nextch()
+}
- // make sure we have at least one rune in buffer, or we are at EOF
- for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.ioerr == nil && s.w-s.r < len(s.buf) {
- s.fill() // s.w-s.r < len(s.buf) => buffer is not full
+func (s *source) nextch() {
+redo:
+ s.col += uint(s.chw)
+ if s.ch == '\n' {
+ s.line++
+ s.col = 0
}
- // common case: ASCII and enough bytes
- // (invariant: s.buf[s.w] == utf8.RuneSelf)
- if b := s.buf[s.r]; b < utf8.RuneSelf {
+ // fast common case: at least one ASCII character
+ if s.ch = rune(s.buf[s.r]); s.ch < sentinel {
s.r++
- // TODO(gri) Optimization: Instead of adjusting s.col for each character,
- // remember the line offset instead and then compute the offset as needed
- // (which is less often).
- s.col++
- if b == 0 {
+ s.chw = 1
+ if s.ch == 0 {
s.error("invalid NUL character")
goto redo
}
- if b == '\n' {
- s.line++
- s.col = colbase
- }
- return rune(b)
+ return
+ }
+
+ // slower general case: add more bytes to buffer if we don't have a full rune
+ for s.e-s.r < utf8.UTFMax && !utf8.FullRune(s.buf[s.r:s.e]) && s.ioerr == nil {
+ s.fill()
}
// EOF
- if s.r == s.w {
+ if s.r == s.e {
if s.ioerr != io.EOF {
// ensure we never start with a '/' (e.g., rooted path) in the error message
s.error("I/O error: " + s.ioerr.Error())
+ s.ioerr = nil
}
- return -1
+ s.ch = -1
+ s.chw = 0
+ return
}
- // uncommon case: not ASCII
- r, w := utf8.DecodeRune(s.buf[s.r:s.w])
- s.r += w
- s.col += uint(w)
+ s.ch, s.chw = utf8.DecodeRune(s.buf[s.r:s.e])
+ s.r += s.chw
- if r == utf8.RuneError && w == 1 {
+ if s.ch == utf8.RuneError && s.chw == 1 {
s.error("invalid UTF-8 encoding")
goto redo
}
// BOM's are only allowed as the first character in a file
const BOM = 0xfeff
- if r == BOM {
- if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to maxunread)
+ if s.ch == BOM {
+ if s.line > 0 || s.col > 0 {
s.error("invalid BOM in the middle of the file")
}
goto redo
}
-
- return r
}
+// fill reads more source bytes into s.buf.
+// It returns with at least one more byte in the buffer, or with s.ioerr != nil.
func (s *source) fill() {
- // Slide unread bytes to beginning but preserve last read char
- // (for one ungetr call) plus maxunread extra bytes (for one
- // unread call).
- if s.r0 > maxunread {
- n := s.r0 - maxunread // number of bytes to slide down
- // save literal prefix, if any
- // (make sure we keep maxunread bytes and the last
- // read char in the buffer)
- if s.suf >= 0 {
- // we have a literal
- if s.suf < n {
- // save literal prefix
- s.lit = append(s.lit, s.buf[s.suf:n]...)
- s.suf = 0
- } else {
- s.suf -= n
- }
- }
- copy(s.buf[:], s.buf[n:s.w])
- s.r0 = maxunread // eqv: s.r0 -= n
- s.r -= n
- s.w -= n
+ // determine content to preserve
+ b := s.r
+ if s.b >= 0 {
+ b = s.b
+ s.b = 0 // after buffer has grown or content has been moved down
}
+ content := s.buf[b:s.e]
+
+ // grow buffer or move content down
+ if len(content)*2 > len(s.buf) {
+ s.buf = make([]byte, nextSize(len(s.buf)))
+ copy(s.buf, content)
+ } else if b > 0 {
+ copy(s.buf, content)
+ }
+ s.r -= b
+ s.e -= b
// read more data: try a limited number of times
- for i := 100; i > 0; i-- {
- n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel
+ for i := 0; i < 10; i++ {
+ var n int
+ n, s.ioerr = s.in.Read(s.buf[s.e : len(s.buf)-1]) // -1 to leave space for sentinel
if n < 0 {
panic("negative read") // incorrect underlying io.Reader implementation
}
- s.w += n
- if n > 0 || err != nil {
- s.buf[s.w] = utf8.RuneSelf // sentinel
- if err != nil {
- s.ioerr = err
- }
+ if n > 0 || s.ioerr != nil {
+ s.e += n
+ s.buf[s.e] = sentinel
return
}
+ // n == 0
}
- s.buf[s.w] = utf8.RuneSelf // sentinel
+ s.buf[s.e] = sentinel
s.ioerr = io.ErrNoProgress
}
-func (s *source) startLit() {
- s.suf = s.r0
- s.lit = s.lit[:0] // reuse lit
-}
-
-func (s *source) stopLit() []byte {
- lit := s.buf[s.suf:s.r]
- if len(s.lit) > 0 {
- lit = append(s.lit, lit...)
+// nextSize returns the next bigger size for a buffer of a given size.
+func nextSize(size int) int {
+ const min = 4 << 10 // 4K: minimum buffer size
+ const max = 1 << 20 // 1M: maximum buffer size which is still doubled
+ if size < min {
+ return min
}
- s.killLit()
- return lit
-}
-
-func (s *source) killLit() {
- s.suf = -1 // no pending literal
+ if size <= max {
+ return size << 1
+ }
+ return size + max
}