Light-weight scanner for general use.

author Robert Griesemer <gri@golang.org>

Wed, 13 Jan 2010 01:04:45 +0000 (17:04 -0800)

committer Robert Griesemer <gri@golang.org>

Wed, 13 Jan 2010 01:04:45 +0000 (17:04 -0800)
author Robert Griesemer <gri@golang.org>
Wed, 13 Jan 2010 01:04:45 +0000 (17:04 -0800)
committer Robert Griesemer <gri@golang.org>
Wed, 13 Jan 2010 01:04:45 +0000 (17:04 -0800)
diff --git a/src/pkg/Makefile b/src/pkg/Makefile

index 4f001db855c2e0b8b9e3556ae965e25cc03f86a9..034a66bb6e3ee830693ea882184df50d6b0e1acf 100644 (file)
--- a/src/pkg/Makefile
+++ b/src/pkg/Makefile
@@ -99,6 +99,7 @@ DIRS=\
         regexp\
         rpc\
         runtime\
+       scanner\
         sort\
         strconv\
         strings\
diff --git a/src/pkg/scanner/Makefile b/src/pkg/scanner/Makefile

new file mode 100644 (file)

index 0000000..8ac16d6
--- /dev/null
+++ b/src/pkg/scanner/Makefile
@@ -0,0 +1,11 @@
+# Copyright 2009 The Go Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+include ../../Make.$(GOARCH)
+
+TARG=scanner
+GOFILES=\
+       scanner.go\
+
+include ../../Make.pkg
diff --git a/src/pkg/scanner/scanner.go b/src/pkg/scanner/scanner.go

new file mode 100644 (file)

index 0000000..c4233aa
--- /dev/null
+++ b/src/pkg/scanner/scanner.go
@@ -0,0 +1,626 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// A general-purpose scanner for text. Takes an io.Reader
+// providing the source which then can be tokenized through
+// repeated calls to the Scan function.
+//
+// By default, a Scanner skips white space and comments and
+// recognizes literals as defined by the Go language spec.
+// It may be customized to recognize only a subset of those
+// literals and to recognize different white space characters.
+//
+// Basic usage pattern:
+//
+//     var s scanner.Scanner
+//     s.Init(src)
+//     tok := s.Scan()
+//     for tok != scanner.EOF {
+//             // do something with tok
+//             tok = s.Scan()
+//     }
+//
+package scanner
+
+import (
+       "bytes"
+       "fmt"
+       "io"
+       "os"
+       "unicode"
+       "utf8"
+)
+
+
+// A source position is represented by a Position value.
+// A position is valid if Line > 0.
+type Position struct {
+       Filename string // filename, if any
+       Offset   int    // byte offset, starting at 0
+       Line     int    // line number, starting at 1
+       Column   int    // column number, starting at 0 (character count per line)
+}
+
+
+// IsValid returns true if the position is valid.
+func (pos *Position) IsValid() bool { return pos.Line > 0 }
+
+
+func (pos Position) String() string {
+       s := pos.Filename
+       if pos.IsValid() {
+               if s != "" {
+                       s += ":"
+               }
+               s += fmt.Sprintf("%d:%d", pos.Line, pos.Column)
+       }
+       if s == "" {
+               s = "???"
+       }
+       return s
+}
+
+
+// Predefined mode bits to control recognition of tokens. For instance,
+// to configure a Scanner such that it only recognizes (Go) identifiers,
+// integers, and skips comments, set the Scanner's Mode field to:
+//
+//     ScanIdents | ScanInts | SkipComments
+//
+const (
+       ScanIdents     = 1 << -Ident
+       ScanInts       = 1 << -Int
+       ScanFloats     = 1 << -Float // includes Ints
+       ScanChars      = 1 << -Char
+       ScanStrings    = 1 << -String
+       ScanRawStrings = 1 << -RawString
+       ScanComments   = 1 << -Comment
+       SkipComments   = 1 << -skipComment // if set with ScanComments, comments become white space
+       GoTokens       = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments
+)
+
+
+// The result of Scan is one of the following tokens or a Unicode character.
+const (
+       EOF = -(iota + 1)
+       Ident
+       Int
+       Float
+       Char
+       String
+       RawString
+       Comment
+       skipComment
+)
+
+
+var tokenString = map[int]string{
+       EOF: "EOF",
+       Ident: "Ident",
+       Int: "Int",
+       Float: "Float",
+       Char: "Char",
+       String: "String",
+       RawString: "RawString",
+       Comment: "Comment",
+}
+
+
+// TokenString returns a (visible) string for a token or Unicode character.
+func TokenString(tok int) string {
+       if s, found := tokenString[tok]; found {
+               return s
+       }
+       return fmt.Sprintf("U+%04X", tok)
+}
+
+
+// GoWhitespace is the default value for the Scanner's Whitespace field.
+// Its value selects Go's white space characters.
+const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' '
+
+
+const bufLen = 1024 // at least utf8.UTFMax
+
+// A Scanner implements reading of Unicode characters and tokens from an io.Reader.
+type Scanner struct {
+       // Input
+       src io.Reader
+
+       // Source buffer
+       srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next()
+       srcPos int              // reading position (srcBuf index)
+       srcEnd int              // source end (srcBuf index)
+
+       // Source position
+       srcBufOffset int // byte offset of srcBuf[0] in source
+       line         int // newline count + 1
+       column       int // character count on line
+
+       // Token text buffer
+       // Typically, token text is stored completely in srcBuf, but in general
+       // the token text's head may be buffered in tokBuf while the token text's
+       // tail is stored in srcBuf.
+       tokBuf bytes.Buffer // token text head that is not in srcBuf anymore
+       tokPos int          // token text tail position (srcBuf index)
+       tokEnd int          // token text tail end (srcBuf index)
+
+       // One character look-ahead
+       ch int // character before current srcPos
+
+       // Error is called for each error encountered. If no Error
+       // function is set, the error is reported to os.Stderr.
+       Error func(s *Scanner, msg string)
+
+       // ErrorCount is incremented by one for each error encountered.
+       ErrorCount int
+
+       // The Mode field controls which tokens are recognized. For instance,
+       // to recognize Ints, set the (1<<-Int) bit in Mode. The field may be
+       // changed at any time.
+       Mode uint
+
+       // The Whitespace field controls which characters are recognized
+       // as white space. To recognize a character ch <= ' ' as white space,
+       // set the ch'th bit in Whitespace (the Scanner's behavior is undefined
+       // for values ch > ' '). The field may be changed at any time.
+       Whitespace uint64
+
+       // Current token position. The Offset, Line, and Column fields
+       // are set by Scan(); the Filename field is left untouched by the
+       // Scanner.
+       Position
+}
+
+
+// Init initializes a Scanner with a new source and returns itself.
+// Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens,
+// and Whitespace is set to GoWhitespace.
+func (s *Scanner) Init(src io.Reader) *Scanner {
+       s.src = src
+
+       // initialize source buffer
+       s.srcBuf[0] = utf8.RuneSelf // sentinel
+       s.srcPos = 0
+       s.srcEnd = 0
+
+       // initialize source position
+       s.srcBufOffset = 0
+       s.line = 1
+       s.column = 0
+
+       // initialize token text buffer
+       s.tokPos = -1
+
+       // initialize one character look-ahead
+       s.ch = s.next()
+
+       // initialize public fields
+       s.Error = nil
+       s.ErrorCount = 0
+       s.Mode = GoTokens
+       s.Whitespace = GoWhitespace
+
+       return s
+}
+
+
+// next reads and returns the next Unicode character. It is designed such
+// that only a minimal amount of work needs to be done in the common ASCII
+// case (one test to check for both ASCII and end-of-buffer, and one test
+// to check for newlines).
+func (s *Scanner) next() int {
+       ch := int(s.srcBuf[s.srcPos])
+
+       if ch >= utf8.RuneSelf {
+               // uncommon case: not ASCII or not enough bytes
+               for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
+                       // not enough bytes: read some more, but first
+                       // save away token text if any
+                       if s.tokPos >= 0 {
+                               s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
+                               s.tokPos = 0
+                       }
+                       // move unread bytes to beginning of buffer
+                       copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
+                       s.srcBufOffset += s.srcPos
+                       // read more bytes
+                       i := s.srcEnd - s.srcPos
+                       n, err := s.src.Read(s.srcBuf[i:bufLen])
+                       s.srcEnd = i + n
+                       s.srcPos = 0
+                       s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
+                       if err != nil {
+                               if s.srcEnd == 0 {
+                                       return EOF
+                               }
+                               s.error(err.String())
+                               break
+                       }
+               }
+               // at least one byte
+               ch = int(s.srcBuf[s.srcPos])
+               if ch >= utf8.RuneSelf {
+                       // uncommon case: not ASCII
+                       var width int
+                       ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
+                       s.srcPos += width - 1
+               }
+       }
+
+       s.srcPos++
+       s.column++
+       if ch == '\n' {
+               s.line++
+               s.column = 0
+       }
+
+       return ch
+}
+
+
+// Next reads and returns the next Unicode character.
+// It returns EOF at the end of the source. It reports
+// a read error by calling s.Error, if set, or else
+// prints an error message to os.Stderr. Next does not
+// update the Scanner's Position field; use Pos() to
+// get the current position.
+func (s *Scanner) Next() int {
+       s.tokPos = -1 // don't collect token text
+       ch := s.ch
+       s.ch = s.next()
+       return ch
+}
+
+
+func (s *Scanner) error(msg string) {
+       s.ErrorCount++
+       if s.Error != nil {
+               s.Error(s, msg)
+               return
+       }
+       fmt.Fprintf(os.Stderr, "%s: %s", s.Position, msg)
+}
+
+
+func (s *Scanner) scanIdentifier() int {
+       ch := s.next() // read character after first '_' or letter
+       for ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) {
+               ch = s.next()
+       }
+       return ch
+}
+
+
+func digitVal(ch int) int {
+       switch {
+       case '0' <= ch && ch <= '9':
+               return ch - '0'
+       case 'a' <= ch && ch <= 'f':
+               return ch - 'a' + 10
+       case 'A' <= ch && ch <= 'F':
+               return ch - 'A' + 10
+       }
+       return 16 // larger than any legal digit val
+}
+
+
+func isDecimal(ch int) bool { return '0' <= ch && ch <= '9' }
+
+
+func (s *Scanner) scanMantissa(ch int) int {
+       for isDecimal(ch) {
+               ch = s.next()
+       }
+       return ch
+}
+
+
+func (s *Scanner) scanFraction(ch int) int {
+       if ch == '.' {
+               ch = s.scanMantissa(s.next())
+       }
+       return ch
+}
+
+
+func (s *Scanner) scanExponent(ch int) int {
+       if ch == 'e' || ch == 'E' {
+               ch = s.next()
+               if ch == '-' || ch == '+' {
+                       ch = s.next()
+               }
+               ch = s.scanMantissa(ch)
+       }
+       return ch
+}
+
+
+func (s *Scanner) scanNumber(ch int) (int, int) {
+       // isDecimal(ch)
+       if ch == '0' {
+               // int or float
+               ch = s.next()
+               if ch == 'x' || ch == 'X' {
+                       // hexadecimal int
+                       ch = s.next()
+                       for digitVal(ch) < 16 {
+                               ch = s.next()
+                       }
+               } else {
+                       // octal int or float
+                       seenDecimalDigit := false
+                       for isDecimal(ch) {
+                               if ch > '7' {
+                                       seenDecimalDigit = true
+                               }
+                               ch = s.next()
+                       }
+                       if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
+                               // float
+                               ch = s.scanFraction(ch)
+                               ch = s.scanExponent(ch)
+                               return Float, ch
+                       }
+                       // octal int
+                       if seenDecimalDigit {
+                               s.error("illegal octal number")
+                       }
+               }
+               return Int, ch
+       }
+       // decimal int or float
+       ch = s.scanMantissa(ch)
+       if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
+               // float
+               ch = s.scanFraction(ch)
+               ch = s.scanExponent(ch)
+               return Float, ch
+       }
+       return Int, ch
+}
+
+
+func (s *Scanner) scanDigits(ch, base, n int) int {
+       for n > 0 && digitVal(ch) < base {
+               ch = s.next()
+               n--
+       }
+       if n > 0 {
+               s.error("illegal char escape")
+       }
+       return ch
+}
+
+
+func (s *Scanner) scanEscape(quote int) int {
+       ch := s.next() // read character after '/'
+       switch ch {
+       case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
+               // nothing to do
+               ch = s.next()
+       case '0', '1', '2', '3', '4', '5', '6', '7':
+               ch = s.scanDigits(ch, 8, 3)
+       case 'x':
+               ch = s.scanDigits(s.next(), 16, 2)
+       case 'u':
+               ch = s.scanDigits(s.next(), 16, 4)
+       case 'U':
+               ch = s.scanDigits(s.next(), 16, 8)
+       default:
+               s.error("illegal char escape")
+       }
+       return ch
+}
+
+
+func (s *Scanner) scanString(quote int) (n int) {
+       ch := s.next() // read character after quote
+       for ch != quote {
+               if ch == '\n' || ch < 0 {
+                       s.error("literal not terminated")
+                       return
+               }
+               if ch == '\\' {
+                       ch = s.scanEscape(quote)
+               } else {
+                       ch = s.next()
+               }
+               n++
+       }
+       return
+}
+
+
+func (s *Scanner) scanRawString() {
+       ch := s.next() // read character after '`'
+       for ch != '`' {
+               if ch < 0 {
+                       s.error("literal not terminated")
+                       return
+               }
+               ch = s.next()
+       }
+}
+
+
+func (s *Scanner) scanChar() {
+       if s.scanString('\'') != 1 {
+               s.error("illegal char literal")
+       }
+}
+
+
+func (s *Scanner) scanLineComment() {
+       ch := s.next() // read character after "//"
+       for ch != '\n' {
+               if ch < 0 {
+                       s.error("comment not terminated")
+                       return
+               }
+               ch = s.next()
+       }
+}
+
+
+func (s *Scanner) scanGeneralComment() {
+       ch := s.next() // read character after "/*"
+       for {
+               if ch < 0 {
+                       s.error("comment not terminated")
+                       return
+               }
+               ch0 := ch
+               ch = s.next()
+               if ch0 == '*' && ch == '/' {
+                       break
+               }
+       }
+}
+
+
+func (s *Scanner) scanComment(ch int) {
+       // ch == '/' || ch == '*'
+       if ch == '/' {
+               s.scanLineComment()
+               return
+       }
+       s.scanGeneralComment()
+}
+
+
+// Scan reads the next token or Unicode character from source and returns it.
+// It only recognizes tokens t for which the respective Mode bit (1<<-t) is set.
+// It returns EOF at the end of the source. It reports scanner errors (read and
+// token errors) by calling s.Error, if set; otherwise it prints an error message
+// to os.Stderr.
+func (s *Scanner) Scan() int {
+       ch := s.ch
+
+       // reset token text position
+       s.tokPos = -1
+
+redo:
+       // skip white space
+       for s.Whitespace&(1<<uint(ch)) != 0 {
+               ch = s.next()
+       }
+
+       // start collecting token text
+       s.tokBuf.Reset()
+       s.tokPos = s.srcPos - 1
+
+       // set token position
+       s.Offset = s.srcBufOffset + s.tokPos
+       s.Line = s.line
+       s.Column = s.column
+
+       // determine token value
+       tok := ch
+       switch {
+       case unicode.IsLetter(ch) || ch == '_':
+               if s.Mode&ScanIdents != 0 {
+                       tok = Ident
+                       ch = s.scanIdentifier()
+               } else {
+                       ch = s.next()
+               }
+       case isDecimal(ch):
+               if s.Mode&(ScanInts|ScanFloats) != 0 {
+                       tok, ch = s.scanNumber(ch)
+               } else {
+                       ch = s.next()
+               }
+       default:
+               switch ch {
+               case '"':
+                       if s.Mode&ScanStrings != 0 {
+                               s.scanString('"')
+                               tok = String
+                       }
+                       ch = s.next()
+               case '\'':
+                       if s.Mode&ScanChars != 0 {
+                               s.scanChar()
+                               tok = Char
+                       }
+                       ch = s.next()
+               case '.':
+                       ch = s.next()
+                       if isDecimal(ch) && s.Mode&ScanFloats != 0 {
+                               tok = Float
+                               ch = s.scanMantissa(ch)
+                               ch = s.scanExponent(ch)
+                       }
+               case '/':
+                       ch = s.next()
+                       if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 {
+                               if s.Mode&SkipComments != 0 {
+                                       s.tokPos = -1 // don't collect token text
+                                       s.scanComment(ch)
+                                       ch = s.next()
+                                       goto redo
+                               }
+                               s.scanComment(ch)
+                               tok = Comment
+                               ch = s.next()
+                       }
+               case '`':
+                       if s.Mode&ScanRawStrings != 0 {
+                               s.scanRawString()
+                               tok = String
+                       }
+                       ch = s.next()
+               default:
+                       ch = s.next()
+               }
+       }
+
+       // end of token text
+       s.tokEnd = s.srcPos - 1
+
+       s.ch = ch
+       return tok
+}
+
+
+// Position returns the current source position. If called before Next()
+// or Scan(), it returns the position of the next Unicode character or token
+// returned by these functions. If called afterwards, it returns the position
+// immediately after the last character of the most recent token or character
+// scanned.
+func (s *Scanner) Pos() Position {
+       return Position{
+               s.Filename,
+               s.srcBufOffset + s.srcPos - 1,
+               s.line,
+               s.column,
+       }
+}
+
+
+// TokenText returns the string corresponding to the most recently scanned token.
+// Valid after calling Scan().
+func (s *Scanner) TokenText() string {
+       if s.tokPos < 0 {
+               // no token text
+               return ""
+       }
+
+       if s.tokEnd < 0 {
+               // if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0)
+               s.tokEnd = s.tokPos
+       }
+
+       if s.tokBuf.Len() == 0 {
+               // common case: the entire token text is still in srcBuf
+               return string(s.srcBuf[s.tokPos:s.tokEnd])
+       }
+
+       // part of the token text was saved in tokBuf: save the rest in
+       // tokBuf as well and return its content
+       s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd])
+       s.tokPos = s.tokEnd // ensure idempotency of TokenText() call
+       return s.tokBuf.String()
+}
diff --git a/src/pkg/scanner/scanner_test.go b/src/pkg/scanner/scanner_test.go

new file mode 100644 (file)

index 0000000..9260480
--- /dev/null
+++ b/src/pkg/scanner/scanner_test.go
@@ -0,0 +1,482 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scanner
+
+import (
+       "bytes"
+       "fmt"
+       "os"
+       "strings"
+       "testing"
+)
+
+
+// A StringReader delivers its data one string segment at a time via Read.
+type StringReader struct {
+       data []string
+       step int
+}
+
+
+func (r *StringReader) Read(p []byte) (n int, err os.Error) {
+       if r.step < len(r.data) {
+               s := r.data[r.step]
+               for i := 0; i < len(s); i++ {
+                       p[i] = s[i]
+               }
+               n = len(s)
+               r.step++
+       } else {
+               err = os.EOF
+       }
+       return
+}
+
+
+func readRuneSegments(t *testing.T, segments []string) {
+       got := ""
+       want := strings.Join(segments, "")
+       s := new(Scanner).Init(&StringReader{data: segments})
+       for {
+               ch := s.Next()
+               if ch == EOF {
+                       break
+               }
+               got += string(ch)
+       }
+       if got != want {
+               t.Errorf("segments=%v got=%s want=%s", segments, got, want)
+       }
+}
+
+
+var segmentList = [][]string{
+       []string{},
+       []string{""},
+       []string{"日", "本語"},
+       []string{"\u65e5", "\u672c", "\u8a9e"},
+       []string{"\U000065e5", " ", "\U0000672c", "\U00008a9e"},
+       []string{"\xe6", "\x97\xa5\xe6", "\x9c\xac\xe8\xaa\x9e"},
+       []string{"Hello", ", ", "World", "!"},
+       []string{"Hello", ", ", "", "World", "!"},
+}
+
+
+func TestNext(t *testing.T) {
+       for _, s := range segmentList {
+               readRuneSegments(t, s)
+       }
+}
+
+
+type token struct {
+       tok  int
+       text string
+}
+
+var f100 = "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+
+var tokenList = []token{
+       token{Comment, "// line comments\n"},
+       token{Comment, "//\n"},
+       token{Comment, "////\n"},
+       token{Comment, "// comment\n"},
+       token{Comment, "// /* comment */\n"},
+       token{Comment, "// // comment //\n"},
+       token{Comment, "//" + f100 + "\n"},
+
+       token{Comment, "// general comments\n"},
+       token{Comment, "/**/"},
+       token{Comment, "/***/"},
+       token{Comment, "/* comment */"},
+       token{Comment, "/* // comment */"},
+       token{Comment, "/* /* comment */"},
+       token{Comment, "/*\n comment\n*/"},
+       token{Comment, "/*" + f100 + "*/"},
+
+       token{Comment, "// identifiers\n"},
+       token{Ident, "a"},
+       token{Ident, "a0"},
+       token{Ident, "foobar"},
+       token{Ident, "abc123"},
+       token{Ident, "LGTM"},
+       token{Ident, "_"},
+       token{Ident, "_abc123"},
+       token{Ident, "abc123_"},
+       token{Ident, "_abc_123_"},
+       token{Ident, "_äöü"},
+       token{Ident, "_本"},
+       // TODO for unknown reasons these fail when checking the literals
+       /*
+               token{Ident, "äöü"},
+               token{Ident, "本"},
+       */
+       token{Ident, "a۰۱۸"},
+       token{Ident, "foo६४"},
+       token{Ident, "bar９８７６"},
+       token{Ident, f100},
+
+       token{Comment, "// decimal ints\n"},
+       token{Int, "0"},
+       token{Int, "1"},
+       token{Int, "9"},
+       token{Int, "42"},
+       token{Int, "1234567890"},
+
+       token{Comment, "// octal ints\n"},
+       token{Int, "00"},
+       token{Int, "01"},
+       token{Int, "07"},
+       token{Int, "042"},
+       token{Int, "01234567"},
+
+       token{Comment, "// hexadecimal ints\n"},
+       token{Int, "0x0"},
+       token{Int, "0x1"},
+       token{Int, "0xf"},
+       token{Int, "0x42"},
+       token{Int, "0x123456789abcDEF"},
+       token{Int, "0x" + f100},
+       token{Int, "0X0"},
+       token{Int, "0X1"},
+       token{Int, "0XF"},
+       token{Int, "0X42"},
+       token{Int, "0X123456789abcDEF"},
+       token{Int, "0X" + f100},
+
+       token{Comment, "// floats\n"},
+       token{Float, "0."},
+       token{Float, "1."},
+       token{Float, "42."},
+       token{Float, "01234567890."},
+       token{Float, ".0"},
+       token{Float, ".1"},
+       token{Float, ".42"},
+       token{Float, ".0123456789"},
+       token{Float, "0.0"},
+       token{Float, "1.0"},
+       token{Float, "42.0"},
+       token{Float, "01234567890.0"},
+       token{Float, "0e0"},
+       token{Float, "1e0"},
+       token{Float, "42e0"},
+       token{Float, "01234567890e0"},
+       token{Float, "0E0"},
+       token{Float, "1E0"},
+       token{Float, "42E0"},
+       token{Float, "01234567890E0"},
+       token{Float, "0e+10"},
+       token{Float, "1e-10"},
+       token{Float, "42e+10"},
+       token{Float, "01234567890e-10"},
+       token{Float, "0E+10"},
+       token{Float, "1E-10"},
+       token{Float, "42E+10"},
+       token{Float, "01234567890E-10"},
+
+       token{Comment, "// chars\n"},
+       token{Char, `' '`},
+       token{Char, `'a'`},
+       token{Char, `'本'`},
+       token{Char, `'\a'`},
+       token{Char, `'\b'`},
+       token{Char, `'\f'`},
+       token{Char, `'\n'`},
+       token{Char, `'\r'`},
+       token{Char, `'\t'`},
+       token{Char, `'\v'`},
+       token{Char, `'\''`},
+       token{Char, `'\000'`},
+       token{Char, `'\777'`},
+       token{Char, `'\x00'`},
+       token{Char, `'\xff'`},
+       token{Char, `'\u0000'`},
+       token{Char, `'\ufA16'`},
+       token{Char, `'\U00000000'`},
+       token{Char, `'\U0000ffAB'`},
+
+       token{Comment, "// strings\n"},
+       token{String, `" "`},
+       token{String, `"a"`},
+       token{String, `"本"`},
+       token{String, `"\a"`},
+       token{String, `"\b"`},
+       token{String, `"\f"`},
+       token{String, `"\n"`},
+       token{String, `"\r"`},
+       token{String, `"\t"`},
+       token{String, `"\v"`},
+       token{String, `"\""`},
+       token{String, `"\000"`},
+       token{String, `"\777"`},
+       token{String, `"\x00"`},
+       token{String, `"\xff"`},
+       token{String, `"\u0000"`},
+       token{String, `"\ufA16"`},
+       token{String, `"\U00000000"`},
+       token{String, `"\U0000ffAB"`},
+       token{String, `"` + f100 + `"`},
+
+       token{Comment, "// raw strings\n"},
+       token{String, "``"},
+       token{String, "`\\`"},
+       token{String, "`" + "\n\n/* foobar */\n\n" + "`"},
+       token{String, "`" + f100 + "`"},
+
+       token{Comment, "// individual characters\n"},
+       token{'\x00', "\x00"},
+       token{'\x01', "\x01"},
+       token{' ' - 1, string(' ' - 1)},
+       token{'+', "+"},
+       token{'/', "/"},
+       token{'.', "."},
+       token{'~', "~"},
+       token{'(', "("},
+}
+
+
+func makeSource(pattern string) *bytes.Buffer {
+       var buf bytes.Buffer
+       for _, k := range tokenList {
+               fmt.Fprintf(&buf, pattern, k.text)
+       }
+       return &buf
+}
+
+
+func checkTok(t *testing.T, s *Scanner, line, got, want int, text string) {
+       if got != want {
+               t.Fatalf("tok = %s, want %s for %q", TokenString(got), TokenString(want), text)
+       }
+       if s.Line != line {
+               t.Errorf("line = %d, want %d for %q", s.Line, line, text)
+       }
+       stext := s.TokenText()
+       if stext != text {
+               t.Errorf("text = %q, want %q", stext, text)
+       } else {
+               // check idempotency of TokenText() call
+               stext = s.TokenText()
+               if stext != text {
+                       t.Errorf("text = %q, want %q (idempotency check)", stext, text)
+               }
+       }
+}
+
+
+func countNewlines(s string) int {
+       n := 0
+       for _, ch := range s {
+               if ch == '\n' {
+                       n++
+               }
+       }
+       return n
+}
+
+
+func testScan(t *testing.T, mode uint) {
+       s := new(Scanner).Init(makeSource(" \t%s\t\n\r"))
+       s.Mode = mode
+       tok := s.Scan()
+       line := 1
+       for _, k := range tokenList {
+               if mode&SkipComments == 0 || k.tok != Comment {
+                       checkTok(t, s, line, tok, k.tok, k.text)
+                       tok = s.Scan()
+               }
+               line += countNewlines(k.text) + 1 // each token is on a new line
+       }
+       checkTok(t, s, line, tok, -1, "")
+}
+
+
+func TestScan(t *testing.T) {
+       testScan(t, GoTokens)
+       testScan(t, GoTokens&^SkipComments)
+}
+
+
+func TestPosition(t *testing.T) {
+       src := makeSource("\t\t\t\t%s\n")
+       s := new(Scanner).Init(src)
+       s.Mode = GoTokens &^ SkipComments
+       s.Scan()
+       pos := Position{"", 4, 1, 5}
+       for _, k := range tokenList {
+               if s.Offset != pos.Offset {
+                       t.Errorf("offset = %d, want %d for %q", s.Offset, pos.Offset, k.text)
+               }
+               if s.Line != pos.Line {
+                       t.Errorf("line = %d, want %d for %q", s.Line, pos.Line, k.text)
+               }
+               if s.Column != pos.Column {
+                       t.Errorf("column = %d, want %d for %q", s.Column, pos.Column, k.text)
+               }
+               pos.Offset += 4 + len(k.text) + 1     // 4 tabs + token bytes + newline
+               pos.Line += countNewlines(k.text) + 1 // each token is on a new line
+               s.Scan()
+       }
+}
+
+
+func TestScanZeroMode(t *testing.T) {
+       src := makeSource("%s\n")
+       str := src.String()
+       s := new(Scanner).Init(src)
+       s.Mode = 0       // don't recognize any token classes
+       s.Whitespace = 0 // don't skip any whitespace
+       tok := s.Scan()
+       for i, ch := range str {
+               if tok != ch {
+                       t.Fatalf("%d. tok = %s, want %s", i, TokenString(tok), TokenString(ch))
+               }
+               tok = s.Scan()
+       }
+       if tok != EOF {
+               t.Fatalf("tok = %s, want EOF", TokenString(tok))
+       }
+}
+
+
+func testScanSelectedMode(t *testing.T, mode uint, class int) {
+       src := makeSource("%s\n")
+       s := new(Scanner).Init(src)
+       s.Mode = mode
+       tok := s.Scan()
+       for tok != EOF {
+               if tok < 0 && tok != class {
+                       t.Fatalf("tok = %s, want %s", TokenString(tok), TokenString(class))
+               }
+               tok = s.Scan()
+       }
+}
+
+
+func TestScanSelectedMask(t *testing.T) {
+       testScanSelectedMode(t, 0, 0)
+       testScanSelectedMode(t, ScanIdents, Ident)
+       // Don't test ScanInts and ScanNumbers since some parts of
+       // the floats in the source look like (illegal) octal ints
+       // and ScanNumbers may return either Int or Float.
+       testScanSelectedMode(t, ScanChars, Char)
+       testScanSelectedMode(t, ScanStrings, String)
+       testScanSelectedMode(t, SkipComments, 0)
+       testScanSelectedMode(t, ScanComments, Comment)
+}
+
+
+func TestScanNext(t *testing.T) {
+       s := new(Scanner).Init(bytes.NewBufferString("if a == bcd /* comment */ {\n\ta += c\n}"))
+       checkTok(t, s, 1, s.Scan(), Ident, "if")
+       checkTok(t, s, 1, s.Scan(), Ident, "a")
+       checkTok(t, s, 1, s.Scan(), '=', "=")
+       checkTok(t, s, 1, s.Next(), '=', "")
+       checkTok(t, s, 1, s.Next(), ' ', "")
+       checkTok(t, s, 1, s.Next(), 'b', "")
+       checkTok(t, s, 1, s.Scan(), Ident, "cd")
+       checkTok(t, s, 1, s.Scan(), '{', "{")
+       checkTok(t, s, 2, s.Scan(), Ident, "a")
+       checkTok(t, s, 2, s.Scan(), '+', "+")
+       checkTok(t, s, 2, s.Next(), '=', "")
+       checkTok(t, s, 2, s.Scan(), Ident, "c")
+       checkTok(t, s, 3, s.Scan(), '}', "}")
+       checkTok(t, s, 3, s.Scan(), -1, "")
+}
+
+
+func TestScanWhitespace(t *testing.T) {
+       var buf bytes.Buffer
+       var ws uint64
+       for ch := byte(0); ch < ' '; ch++ {
+               buf.WriteByte(ch)
+               ws |= 1 << ch
+       }
+       const orig = 'x'
+       buf.WriteByte(orig)
+
+       s := new(Scanner).Init(&buf)
+       s.Mode = 0
+       s.Whitespace = ws
+       tok := s.Scan()
+       if tok != orig {
+               t.Errorf("tok = %s, want %s", TokenString(tok), TokenString(orig))
+       }
+}
+
+
+func testError(t *testing.T, src, msg string, tok int) {
+       s := new(Scanner).Init(bytes.NewBufferString(src))
+       errorCalled := false
+       s.Error = func(s *Scanner, m string) {
+               if !errorCalled {
+                       // only look at first error
+                       if m != msg {
+                               t.Errorf("msg = %q, want %q for %q", m, msg, src)
+                       }
+                       errorCalled = true
+               }
+       }
+       tk := s.Scan()
+       if tk != tok {
+               t.Errorf("tok = %s, want %s for %q", TokenString(tk), TokenString(tok), src)
+       }
+       if !errorCalled {
+               t.Errorf("error handler not called for %q", src)
+       }
+       if s.ErrorCount == 0 {
+               t.Errorf("count = %d, want > 0 for %q", s.ErrorCount, src)
+       }
+}
+
+
+func TestError(t *testing.T) {
+       testError(t, `01238`, "illegal octal number", Int)
+       testError(t, `'\"'`, "illegal char escape", Char)
+       testError(t, `'aa'`, "illegal char literal", Char)
+       testError(t, `'`, "literal not terminated", Char)
+       testError(t, `"\'"`, "illegal char escape", String)
+       testError(t, `"abc`, "literal not terminated", String)
+       testError(t, "`abc", "literal not terminated", String)
+       testError(t, `//`, "comment not terminated", EOF)
+       testError(t, `/*/`, "comment not terminated", EOF)
+}
+
+
+func checkPos(t *testing.T, s *Scanner, offset, line, column, char int) {
+       pos := s.Pos()
+       if pos.Offset != offset {
+               t.Errorf("offset = %d, want %d", pos.Offset, offset)
+       }
+       if pos.Line != line {
+               t.Errorf("line = %d, want %d", pos.Line, line)
+       }
+       if pos.Column != column {
+               t.Errorf("column = %d, want %d", pos.Column, column)
+       }
+       ch := s.Scan()
+       if ch != char {
+               t.Errorf("ch = %s, want %s", TokenString(ch), TokenString(char))
+       }
+}
+
+
+func TestPos(t *testing.T) {
+       s := new(Scanner).Init(bytes.NewBufferString("abc\n012\n\nx"))
+       s.Mode = 0
+       s.Whitespace = 0
+       checkPos(t, s, 0, 1, 1, 'a')
+       checkPos(t, s, 1, 1, 2, 'b')
+       checkPos(t, s, 2, 1, 3, 'c')
+       checkPos(t, s, 3, 2, 0, '\n')
+       checkPos(t, s, 4, 2, 1, '0')
+       checkPos(t, s, 5, 2, 2, '1')
+       checkPos(t, s, 6, 2, 3, '2')
+       checkPos(t, s, 7, 3, 0, '\n')
+       checkPos(t, s, 8, 4, 0, '\n')
+       checkPos(t, s, 9, 4, 1, 'x')
+       checkPos(t, s, 9, 4, 1, EOF)
+       checkPos(t, s, 9, 4, 1, EOF) // after EOF, position doesn't change
+}
author	Robert Griesemer <gri@golang.org>
	Wed, 13 Jan 2010 01:04:45 +0000 (17:04 -0800)
committer	Robert Griesemer <gri@golang.org>
	Wed, 13 Jan 2010 01:04:45 +0000 (17:04 -0800)
src/pkg/Makefile		patch \| blob \| history
src/pkg/scanner/Makefile	[new file with mode: 0644]	patch \| blob
src/pkg/scanner/scanner.go	[new file with mode: 0644]	patch \| blob
src/pkg/scanner/scanner_test.go	[new file with mode: 0644]	patch \| blob