go/scanner: accept new Go2 number literals

author Robert Griesemer <gri@golang.org>

Mon, 28 Jan 2019 23:29:23 +0000 (15:29 -0800)

committer Robert Griesemer <gri@golang.org>

Mon, 11 Feb 2019 23:23:32 +0000 (23:23 +0000)
author Robert Griesemer <gri@golang.org>
Mon, 28 Jan 2019 23:29:23 +0000 (15:29 -0800)
committer Robert Griesemer <gri@golang.org>
Mon, 11 Feb 2019 23:23:32 +0000 (23:23 +0000)
diff --git a/src/go/scanner/scanner.go b/src/go/scanner/scanner.go

index e78abf12a27cf445c7cc129877fc3f3fa493927c..9e85d4898a5c0a549e8fe660f3b17fe173bb35f2 100644 (file)
--- a/src/go/scanner/scanner.go
+++ b/src/go/scanner/scanner.go
@@ -150,6 +150,10 @@ func (s *Scanner) error(offs int, msg string) {
         s.ErrorCount++
  }
  
+func (s *Scanner) errorf(offs int, format string, args ...interface{}) {
+       s.error(offs, fmt.Sprintf(format, args...))
+}
+
  func (s *Scanner) scanComment() string {
         // initial '/' already consumed; s.ch == '/' || s.ch == '*'
         offs := s.offset - 1 // position of initial '/'
@@ -336,11 +340,11 @@ func (s *Scanner) findLineEnd() bool {
  }
  
  func isLetter(ch rune) bool {
-       return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
+       return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
  }
  
  func isDigit(ch rune) bool {
-       return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
+       return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
  }
  
  func (s *Scanner) scanIdentifier() string {
@@ -355,95 +359,188 @@ func digitVal(ch rune) int {
         switch {
         case '0' <= ch && ch <= '9':
                 return int(ch - '0')
-       case 'a' <= ch && ch <= 'f':
-               return int(ch - 'a' + 10)
-       case 'A' <= ch && ch <= 'F':
-               return int(ch - 'A' + 10)
+       case 'a' <= lower(ch) && lower(ch) <= 'f':
+               return int(lower(ch) - 'a' + 10)
         }
         return 16 // larger than any legal digit val
  }
  
-func (s *Scanner) scanMantissa(base int) {
-       for digitVal(s.ch) < base {
-               s.next()
+func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
+func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
+func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
+
+// digits accepts the sequence { digit | '_' }.
+// If base <= 10, digits accepts any decimal digit but records
+// the offset (relative to the source start) of the first digit
+// >= base in *invalid, if *invalid < 0; invalid may be nil if base >= 10.
+// digits returns a bitset describing whether the sequence contained
+// digits (bit 0 is set), or separators '_' (bit 1 is set).
+func (s *Scanner) digits(base int, invalid *int) (digsep int) {
+       if base <= 10 {
+               max := rune('0' + base)
+               for isDecimal(s.ch) || s.ch == '_' {
+                       ds := 1
+                       if s.ch == '_' {
+                               ds = 2
+                       } else if s.ch >= max && *invalid < 0 {
+                               *invalid = int(s.offset) // record invalid rune offset
+                       }
+                       digsep |= ds
+                       s.next()
+               }
+       } else {
+               for isHex(s.ch) || s.ch == '_' {
+                       ds := 1
+                       if s.ch == '_' {
+                               ds = 2
+                       }
+                       digsep |= ds
+                       s.next()
+               }
         }
+       return
  }
  
-func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
-       // digitVal(s.ch) < 10
+func (s *Scanner) scanNumber() (token.Token, string) {
         offs := s.offset
-       tok := token.INT
+       tok := token.ILLEGAL
  
-       if seenDecimalPoint {
-               offs--
-               tok = token.FLOAT
-               s.scanMantissa(10)
-               goto exponent
-       }
+       base := 10        // number base
+       prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
+       digsep := 0       // bit 0: digit present, bit 1: '_' present
+       invalid := -1     // index of invalid digit in literal, or < 0
  
-       if s.ch == '0' {
-               // int or float
-               offs := s.offset
-               s.next()
-               if s.ch == 'x' || s.ch == 'X' {
-                       // hexadecimal int
+       // integer part
+       if s.ch != '.' {
+               tok = token.INT
+               if s.ch == '0' {
                         s.next()
-                       s.scanMantissa(16)
-                       if s.offset-offs <= 2 {
-                               // only scanned "0x" or "0X"
-                               s.error(offs, "illegal hexadecimal number")
-                       }
-               } else {
-                       // octal int or float
-                       seenDecimalDigit := false
-                       s.scanMantissa(8)
-                       if s.ch == '8' || s.ch == '9' {
-                               // illegal octal int or float
-                               seenDecimalDigit = true
-                               s.scanMantissa(10)
-                       }
-                       if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' {
-                               goto fraction
-                       }
-                       // octal int
-                       if seenDecimalDigit {
-                               s.error(offs, "illegal octal number")
+                       switch lower(s.ch) {
+                       case 'x':
+                               s.next()
+                               base, prefix = 16, 'x'
+                       case 'o':
+                               s.next()
+                               base, prefix = 8, 'o'
+                       case 'b':
+                               s.next()
+                               base, prefix = 2, 'b'
+                       default:
+                               base, prefix = 8, '0'
+                               digsep = 1 // leading 0
                         }
                 }
-               goto exit
+               digsep |= s.digits(base, &invalid)
         }
  
-       // decimal int or float
-       s.scanMantissa(10)
-
-fraction:
+       // fractional part
         if s.ch == '.' {
                 tok = token.FLOAT
+               if prefix == 'o' || prefix == 'b' {
+                       s.error(s.offset, "invalid radix point in "+litname(prefix))
+               }
                 s.next()
-               s.scanMantissa(10)
+               digsep |= s.digits(base, &invalid)
         }
  
-exponent:
-       if s.ch == 'e' || s.ch == 'E' {
-               tok = token.FLOAT
+       if digsep&1 == 0 {
+               s.error(s.offset, litname(prefix)+" has no digits")
+       }
+
+       // exponent
+       if e := lower(s.ch); e == 'e' || e == 'p' {
+               switch {
+               case e == 'e' && prefix != 0 && prefix != '0':
+                       s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
+               case e == 'p' && prefix != 'x':
+                       s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
+               }
                 s.next()
-               if s.ch == '-' || s.ch == '+' {
+               tok = token.FLOAT
+               if s.ch == '+' || s.ch == '-' {
                         s.next()
                 }
-               if digitVal(s.ch) < 10 {
-                       s.scanMantissa(10)
-               } else {
-                       s.error(offs, "illegal floating-point exponent")
+               ds := s.digits(10, nil)
+               digsep |= ds
+               if ds&1 == 0 {
+                       s.error(s.offset, "exponent has no digits")
                 }
+       } else if prefix == 'x' && tok == token.FLOAT {
+               s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
         }
  
+       // suffix 'i'
         if s.ch == 'i' {
                 tok = token.IMAG
+               if prefix != 0 && prefix != '0' {
+                       s.error(s.offset, "invalid suffix 'i' on "+litname(prefix))
+               }
                 s.next()
         }
  
-exit:
-       return tok, string(s.src[offs:s.offset])
+       lit := string(s.src[offs:s.offset])
+       if tok == token.INT && invalid >= 0 {
+               s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
+       }
+       if digsep&2 != 0 {
+               if i := invalidSep(lit); i >= 0 {
+                       s.error(offs+i, "'_' must separate successive digits")
+               }
+       }
+
+       return tok, lit
+}
+
+func litname(prefix rune) string {
+       switch prefix {
+       case 'x':
+               return "hexadecimal literal"
+       case 'o', '0':
+               return "octal literal"
+       case 'b':
+               return "binary literal"
+       }
+       return "decimal literal"
+}
+
+// invalidSep returns the index of the first invalid separator in x, or -1.
+func invalidSep(x string) int {
+       x1 := ' ' // prefix char, we only care if it's 'x'
+       d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
+       i := 0
+
+       // a prefix counts as a digit
+       if len(x) >= 2 && x[0] == '0' {
+               x1 = lower(rune(x[1]))
+               if x1 == 'x' || x1 == 'o' || x1 == 'b' {
+                       d = '0'
+                       i = 2
+               }
+       }
+
+       // mantissa and exponent
+       for ; i < len(x); i++ {
+               p := d // previous digit
+               d = rune(x[i])
+               switch {
+               case d == '_':
+                       if p != '0' {
+                               return i
+                       }
+               case isDecimal(d) || x1 == 'x' && isHex(d):
+                       d = '0'
+               default:
+                       if p == '_' {
+                               return i - 1
+                       }
+                       d = '.'
+               }
+       }
+       if d == '_' {
+               return len(x) - 1
+       }
+
+       return -1
  }
  
  // scanEscape parses an escape sequence where rune is the accepted
@@ -708,9 +805,9 @@ scanAgain:
                         insertSemi = true
                         tok = token.IDENT
                 }
-       case '0' <= ch && ch <= '9':
+       case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
                 insertSemi = true
-               tok, lit = s.scanNumber(false)
+               tok, lit = s.scanNumber()
         default:
                 s.next() // always make progress
                 switch ch {
@@ -741,16 +838,12 @@ scanAgain:
                 case ':':
                         tok = s.switch2(token.COLON, token.DEFINE)
                 case '.':
-                       if '0' <= s.ch && s.ch <= '9' {
-                               insertSemi = true
-                               tok, lit = s.scanNumber(true)
-                       } else {
-                               tok = token.PERIOD
-                               if s.ch == '.' && s.peek() == '.' {
-                                       s.next()
-                                       s.next() // consume last '.'
-                                       tok = token.ELLIPSIS
-                               }
+                       // fractions starting with a '.' are handled by outer switch
+                       tok = token.PERIOD
+                       if s.ch == '.' && s.peek() == '.' {
+                               s.next()
+                               s.next() // consume last '.'
+                               tok = token.ELLIPSIS
                         }
                 case ',':
                         tok = token.COMMA
@@ -835,7 +928,7 @@ scanAgain:
                 default:
                         // next reports unexpected BOMs - don't repeat
                         if ch != bom {
-                               s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
+                               s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
                         }
                         insertSemi = s.insertSemi // preserve insertSemi info
                         tok = token.ILLEGAL
diff --git a/src/go/scanner/scanner_test.go b/src/go/scanner/scanner_test.go

index 36c962209ce1dc5addb02c071dce54ffc77a2c7c..1d6865f198b006740c1faeabb654febd78ee90a4 100644 (file)
--- a/src/go/scanner/scanner_test.go
+++ b/src/go/scanner/scanner_test.go
@@ -10,6 +10,7 @@ import (
         "os"
         "path/filepath"
         "runtime"
+       "strings"
         "testing"
  )
  
@@ -802,11 +803,10 @@ var errors = []struct {
         {"078.", token.FLOAT, 0, "078.", ""},
         {"07801234567.", token.FLOAT, 0, "07801234567.", ""},
         {"078e0", token.FLOAT, 0, "078e0", ""},
-       {"0E", token.FLOAT, 0, "0E", "illegal floating-point exponent"}, // issue 17621
-       {"078", token.INT, 0, "078", "illegal octal number"},
-       {"07800000009", token.INT, 0, "07800000009", "illegal octal number"},
-       {"0x", token.INT, 0, "0x", "illegal hexadecimal number"},
-       {"0X", token.INT, 0, "0X", "illegal hexadecimal number"},
+       {"0E", token.FLOAT, 2, "0E", "exponent has no digits"}, // issue 17621
+       {"078", token.INT, 2, "078", "invalid digit '8' in octal literal"},
+       {"07090000008", token.INT, 3, "07090000008", "invalid digit '9' in octal literal"},
+       {"0x", token.INT, 2, "0x", "hexadecimal literal has no digits"},
         {"\"abc\x00def\"", token.STRING, 4, "\"abc\x00def\"", "illegal character NUL"},
         {"\"abc\x80def\"", token.STRING, 4, "\"abc\x80def\"", "illegal UTF-8 encoding"},
         {"\ufeff\ufeff", token.ILLEGAL, 3, "\ufeff\ufeff", "illegal byte order mark"},                        // only first BOM is ignored
@@ -912,3 +912,199 @@ func BenchmarkScanFile(b *testing.B) {
                 }
         }
  }
+
+func TestNumbers(t *testing.T) {
+       for _, test := range []struct {
+               tok              token.Token
+               src, tokens, err string
+       }{
+               // binaries
+               {token.INT, "0b0", "0b0", ""},
+               {token.INT, "0b1010", "0b1010", ""},
+               {token.INT, "0B1110", "0B1110", ""},
+
+               {token.INT, "0b", "0b", "binary literal has no digits"},
+               {token.INT, "0b0190", "0b0190", "invalid digit '9' in binary literal"},
+               {token.INT, "0b01a0", "0b01 a0", ""}, // only accept 0-9
+
+               // binary floats and imaginaries (invalid)
+               {token.FLOAT, "0b.", "0b.", "invalid radix point in binary literal"},
+               {token.FLOAT, "0b.1", "0b.1", "invalid radix point in binary literal"},
+               {token.FLOAT, "0b1.0", "0b1.0", "invalid radix point in binary literal"},
+               {token.FLOAT, "0b1e10", "0b1e10", "'e' exponent requires decimal mantissa"},
+               {token.FLOAT, "0b1P-1", "0b1P-1", "'P' exponent requires hexadecimal mantissa"},
+               {token.IMAG, "0b10i", "0b10i", "invalid suffix 'i' on binary literal"},
+
+               // octals
+               {token.INT, "0o0", "0o0", ""},
+               {token.INT, "0o1234", "0o1234", ""},
+               {token.INT, "0O1234", "0O1234", ""},
+
+               {token.INT, "0o", "0o", "octal literal has no digits"},
+               {token.INT, "0o8123", "0o8123", "invalid digit '8' in octal literal"},
+               {token.INT, "0o1293", "0o1293", "invalid digit '9' in octal literal"},
+               {token.INT, "0o12a3", "0o12 a3", ""}, // only accept 0-9
+
+               // octal floats and imaginaries (invalid)
+               {token.FLOAT, "0o.", "0o.", "invalid radix point in octal literal"},
+               {token.FLOAT, "0o.2", "0o.2", "invalid radix point in octal literal"},
+               {token.FLOAT, "0o1.2", "0o1.2", "invalid radix point in octal literal"},
+               {token.FLOAT, "0o1E+2", "0o1E+2", "'E' exponent requires decimal mantissa"},
+               {token.FLOAT, "0o1p10", "0o1p10", "'p' exponent requires hexadecimal mantissa"},
+               {token.IMAG, "0o10i", "0o10i", "invalid suffix 'i' on octal literal"},
+
+               // 0-octals
+               {token.INT, "0", "0", ""},
+               {token.INT, "0123", "0123", ""},
+
+               {token.INT, "08123", "08123", "invalid digit '8' in octal literal"},
+               {token.INT, "01293", "01293", "invalid digit '9' in octal literal"},
+               {token.INT, "0F.", "0 F .", ""}, // only accept 0-9
+               {token.INT, "0123F.", "0123 F .", ""},
+               {token.INT, "0123456x", "0123456 x", ""},
+
+               // decimals
+               {token.INT, "1", "1", ""},
+               {token.INT, "1234", "1234", ""},
+
+               {token.INT, "1f", "1 f", ""}, // only accept 0-9
+
+               // decimal floats
+               {token.FLOAT, "0.", "0.", ""},
+               {token.FLOAT, "123.", "123.", ""},
+               {token.FLOAT, "0123.", "0123.", ""},
+
+               {token.FLOAT, ".0", ".0", ""},
+               {token.FLOAT, ".123", ".123", ""},
+               {token.FLOAT, ".0123", ".0123", ""},
+
+               {token.FLOAT, "0.0", "0.0", ""},
+               {token.FLOAT, "123.123", "123.123", ""},
+               {token.FLOAT, "0123.0123", "0123.0123", ""},
+
+               {token.FLOAT, "0e0", "0e0", ""},
+               {token.FLOAT, "123e+0", "123e+0", ""},
+               {token.FLOAT, "0123E-1", "0123E-1", ""},
+
+               {token.FLOAT, "0.e+1", "0.e+1", ""},
+               {token.FLOAT, "123.E-10", "123.E-10", ""},
+               {token.FLOAT, "0123.e123", "0123.e123", ""},
+
+               {token.FLOAT, ".0e-1", ".0e-1", ""},
+               {token.FLOAT, ".123E+10", ".123E+10", ""},
+               {token.FLOAT, ".0123E123", ".0123E123", ""},
+
+               {token.FLOAT, "0.0e1", "0.0e1", ""},
+               {token.FLOAT, "123.123E-10", "123.123E-10", ""},
+               {token.FLOAT, "0123.0123e+456", "0123.0123e+456", ""},
+
+               {token.FLOAT, "0e", "0e", "exponent has no digits"},
+               {token.FLOAT, "0E+", "0E+", "exponent has no digits"},
+               {token.FLOAT, "1e+f", "1e+ f", "exponent has no digits"},
+               {token.FLOAT, "0p0", "0p0", "'p' exponent requires hexadecimal mantissa"},
+               {token.FLOAT, "1.0P-1", "1.0P-1", "'P' exponent requires hexadecimal mantissa"},
+
+               // decimal imaginaries
+               {token.IMAG, "0.i", "0.i", ""},
+               {token.IMAG, ".123i", ".123i", ""},
+               {token.IMAG, "123.123i", "123.123i", ""},
+               {token.IMAG, "123e+0i", "123e+0i", ""},
+               {token.IMAG, "123.E-10i", "123.E-10i", ""},
+               {token.IMAG, ".123E+10i", ".123E+10i", ""},
+
+               // hexadecimals
+               {token.INT, "0x0", "0x0", ""},
+               {token.INT, "0x1234", "0x1234", ""},
+               {token.INT, "0xcafef00d", "0xcafef00d", ""},
+               {token.INT, "0XCAFEF00D", "0XCAFEF00D", ""},
+
+               {token.INT, "0x", "0x", "hexadecimal literal has no digits"},
+               {token.INT, "0x1g", "0x1 g", ""},
+
+               // hexadecimal floats
+               {token.FLOAT, "0x0p0", "0x0p0", ""},
+               {token.FLOAT, "0x12efp-123", "0x12efp-123", ""},
+               {token.FLOAT, "0xABCD.p+0", "0xABCD.p+0", ""},
+               {token.FLOAT, "0x.0189P-0", "0x.0189P-0", ""},
+               {token.FLOAT, "0x1.ffffp+1023", "0x1.ffffp+1023", ""},
+
+               {token.FLOAT, "0x.", "0x.", "hexadecimal literal has no digits"},
+               {token.FLOAT, "0x0.", "0x0.", "hexadecimal mantissa requires a 'p' exponent"},
+               {token.FLOAT, "0x.0", "0x.0", "hexadecimal mantissa requires a 'p' exponent"},
+               {token.FLOAT, "0x1.1", "0x1.1", "hexadecimal mantissa requires a 'p' exponent"},
+               {token.FLOAT, "0x1.1e0", "0x1.1e0", "hexadecimal mantissa requires a 'p' exponent"},
+               {token.FLOAT, "0x1.2gp1a", "0x1.2 gp1a", "hexadecimal mantissa requires a 'p' exponent"},
+               {token.FLOAT, "0x0p", "0x0p", "exponent has no digits"},
+               {token.FLOAT, "0xeP-", "0xeP-", "exponent has no digits"},
+               {token.FLOAT, "0x1234PAB", "0x1234P AB", "exponent has no digits"},
+               {token.FLOAT, "0x1.2p1a", "0x1.2p1 a", ""},
+
+               // hexadecimal imaginaries (invalid)
+               {token.IMAG, "0xf00i", "0xf00i", "invalid suffix 'i' on hexadecimal literal"},
+               {token.IMAG, "0xf00.bap+12i", "0xf00.bap+12i", "invalid suffix 'i' on hexadecimal literal"},
+
+               // separators
+               {token.INT, "0b_1000_0001", "0b_1000_0001", ""},
+               {token.INT, "0o_600", "0o_600", ""},
+               {token.INT, "0_466", "0_466", ""},
+               {token.INT, "1_000", "1_000", ""},
+               {token.FLOAT, "1_000.000_1", "1_000.000_1", ""},
+               {token.IMAG, "10e+1_2_3i", "10e+1_2_3i", ""},
+               {token.INT, "0x_f00d", "0x_f00d", ""},
+               {token.FLOAT, "0x_f00d.0p1_2", "0x_f00d.0p1_2", ""},
+
+               {token.INT, "0b__1000", "0b__1000", "'_' must separate successive digits"},
+               {token.INT, "0o60___0", "0o60___0", "'_' must separate successive digits"},
+               {token.INT, "0466_", "0466_", "'_' must separate successive digits"},
+               {token.FLOAT, "1_.", "1_.", "'_' must separate successive digits"},
+               {token.FLOAT, "0._1", "0._1", "'_' must separate successive digits"},
+               {token.FLOAT, "2.7_e0", "2.7_e0", "'_' must separate successive digits"},
+               {token.IMAG, "10e+12_i", "10e+12_i", "'_' must separate successive digits"},
+               {token.INT, "0x___0", "0x___0", "'_' must separate successive digits"},
+               {token.FLOAT, "0x1.0_p0", "0x1.0_p0", "'_' must separate successive digits"},
+       } {
+               var s Scanner
+               var err string
+               s.Init(fset.AddFile("", fset.Base(), len(test.src)), []byte(test.src), func(_ token.Position, msg string) {
+                       if err == "" {
+                               err = msg
+                       }
+               }, 0)
+               for i, want := range strings.Split(test.tokens, " ") {
+                       err = ""
+                       _, tok, lit := s.Scan()
+
+                       // compute lit for tokens where lit is not defined
+                       switch tok {
+                       case token.PERIOD:
+                               lit = "."
+                       case token.ADD:
+                               lit = "+"
+                       case token.SUB:
+                               lit = "-"
+                       }
+
+                       if i == 0 {
+                               if tok != test.tok {
+                                       t.Errorf("%q: got token %s; want %s", test.src, tok, test.tok)
+                               }
+                               if err != test.err {
+                                       t.Errorf("%q: got error %q; want %q", test.src, err, test.err)
+                               }
+                       }
+
+                       if lit != want {
+                               t.Errorf("%q: got literal %q (%s); want %s", test.src, lit, tok, want)
+                       }
+               }
+
+               // make sure we read all
+               _, tok, _ := s.Scan()
+               if tok == token.SEMICOLON {
+                       _, tok, _ = s.Scan()
+               }
+               if tok != token.EOF {
+                       t.Errorf("%q: got %s; want EOF", test.src, tok)
+               }
+       }
+}
author	Robert Griesemer <gri@golang.org>
	Mon, 28 Jan 2019 23:29:23 +0000 (15:29 -0800)
committer	Robert Griesemer <gri@golang.org>
	Mon, 11 Feb 2019 23:23:32 +0000 (23:23 +0000)
src/go/scanner/scanner.go		patch \| blob \| history
src/go/scanner/scanner_test.go		patch \| blob \| history