From: Robert Griesemer Date: Mon, 16 Jun 2014 23:32:47 +0000 (-0700) Subject: text/scanner: provide facility for custom identifiers X-Git-Tag: go1.4beta1~1284 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=60c0b3b5cf89c8054328e27d7ee58da79a12999e;p=gostls13.git text/scanner: provide facility for custom identifiers LGTM=r R=golang-codereviews, r CC=golang-codereviews https://golang.org/cl/108030044 --- diff --git a/src/pkg/text/scanner/scanner.go b/src/pkg/text/scanner/scanner.go index db7ca73c68..e7cdd33366 100644 --- a/src/pkg/text/scanner/scanner.go +++ b/src/pkg/text/scanner/scanner.go @@ -11,7 +11,7 @@ // By default, a Scanner skips white space and Go comments and recognizes all // literals as defined by the Go language specification. It may be // customized to recognize only a subset of those literals and to recognize -// different white space characters. +// different identifier and white space characters. // // Basic usage pattern: // @@ -34,8 +34,6 @@ import ( "unicode/utf8" ) -// TODO(gri): Consider changing this to use the new (token) Position package. - // A source position is represented by a Position value. // A position is valid if Line > 0. type Position struct { @@ -164,6 +162,13 @@ type Scanner struct { // for values ch > ' '). The field may be changed at any time. Whitespace uint64 + // IsIdentRune is a predicate controlling the characters accepted + // as the ith rune in an identifier. The set of valid characters + // must not intersect with the set of white space characters. + // If no IsIdentRune function is set, regular Go identifiers are + // accepted instead. The field may be changed at any time. + IsIdentRune func(ch rune, i int) bool + // Start position of most recently scanned token; set by Scan. // Calling Init or Next invalidates the position (Line == 0). // The Filename field is always left untouched by the Scanner. @@ -334,9 +339,17 @@ func (s *Scanner) error(msg string) { fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg) } +func (s *Scanner) isIdentRune(ch rune, i int) bool { + if s.IsIdentRune != nil { + return s.IsIdentRune(ch, i) + } + return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) && i > 0 +} + func (s *Scanner) scanIdentifier() rune { - ch := s.next() // read character after first '_' or letter - for ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) { + // we know the zero'th rune is OK; start with 2nd one + ch := s.next() + for i := 1; s.isIdentRune(ch, i); i++ { ch = s.next() } return ch @@ -563,7 +576,7 @@ redo: // determine token value tok := ch switch { - case unicode.IsLetter(ch) || ch == '_': + case s.isIdentRune(ch, 0): if s.Mode&ScanIdents != 0 { tok = Ident ch = s.scanIdentifier() diff --git a/src/pkg/text/scanner/scanner_test.go b/src/pkg/text/scanner/scanner_test.go index 7d3f597eb9..702fac2b1a 100644 --- a/src/pkg/text/scanner/scanner_test.go +++ b/src/pkg/text/scanner/scanner_test.go @@ -357,6 +357,28 @@ func TestScanSelectedMask(t *testing.T) { testScanSelectedMode(t, ScanComments, Comment) } +func TestScanCustomIdent(t *testing.T) { + const src = "faab12345 a12b123 a12 3b" + s := new(Scanner).Init(strings.NewReader(src)) + // ident = ( 'a' | 'b' ) { digit } . + // digit = '0' .. '3' . + // with a maximum length of 4 + s.IsIdentRune = func(ch rune, i int) bool { + return i == 0 && (ch == 'a' || ch == 'b') || 0 < i && i < 4 && '0' <= ch && ch <= '3' + } + checkTok(t, s, 1, s.Scan(), 'f', "f") + checkTok(t, s, 1, s.Scan(), Ident, "a") + checkTok(t, s, 1, s.Scan(), Ident, "a") + checkTok(t, s, 1, s.Scan(), Ident, "b123") + checkTok(t, s, 1, s.Scan(), Int, "45") + checkTok(t, s, 1, s.Scan(), Ident, "a12") + checkTok(t, s, 1, s.Scan(), Ident, "b123") + checkTok(t, s, 1, s.Scan(), Ident, "a12") + checkTok(t, s, 1, s.Scan(), Int, "3") + checkTok(t, s, 1, s.Scan(), Ident, "b") + checkTok(t, s, 1, s.Scan(), EOF, "") +} + func TestScanNext(t *testing.T) { const BOM = '\uFEFF' BOMs := string(BOM)