]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/yacc: allow utf-8 token values
authorRob Pike <r@golang.org>
Thu, 13 Sep 2012 20:59:00 +0000 (13:59 -0700)
committerRob Pike <r@golang.org>
Thu, 13 Sep 2012 20:59:00 +0000 (13:59 -0700)
Also clean up the code and allow \U.
Fixes #3007.

R=golang-dev, rsc, 0xjnml
CC=golang-dev
https://golang.org/cl/6492105

src/cmd/yacc/units.y
src/cmd/yacc/yacc.go

index 32d37e50394cd57c8453b6e4ec1b59fb4f122381..00ccaf2ecece23f5d570b8cfa0cadb8a833a4522 100644 (file)
@@ -76,7 +76,7 @@ var vflag bool
 
 %type  <node>  prog expr expr0 expr1 expr2 expr3 expr4
 
-%token <vval>  VAL
+%token <vval>  VÄL // dieresis to test UTF-8
 %token <vvar>  VAR
 %token <numb>  _SUP // tests leading underscore in token name
 %%
@@ -199,7 +199,7 @@ expr0:
                        $$ = $1.node
                }
        }
-|      VAL
+|      VÄL
        {
                $$ = one
                $$.vval = $1
@@ -275,7 +275,7 @@ numb:
                f = 0
        }
        yylval.vval = f
-       return VAL
+       return VÄL
 }
 
 func (UnitsLex) Error(s string) {
index a4ae35349a28c95d4beb1e2c151991cc7be147ef..25bd2229871a201eabe5ffaa4414d62aa667f9f4 100644 (file)
@@ -52,6 +52,7 @@ import (
        "os"
        "strings"
        "unicode"
+       "unicode/utf8"
 )
 
 // the following are adjustable
@@ -326,7 +327,6 @@ var resrv = []Resrv{
 var zznewstate = 0
 
 const EOF = -1
-const UTFmax = 0x3f
 
 func main() {
 
@@ -719,8 +719,8 @@ func moreprod() {
 }
 
 //
-// define s to be a terminal if t=0
-// or a nonterminal if t=1
+// define s to be a terminal if nt==0
+// or a nonterminal if nt==1
 //
 func defin(nt int, s string) int {
        val := 0
@@ -753,56 +753,66 @@ func defin(nt int, s string) int {
 
        // establish value for token
        // single character literal
-       if s[0] == ' ' && len(s) == 1+1 {
-               val = int(s[1])
-       } else if s[0] == ' ' && s[1] == '\\' { // escape sequence
-               if len(s) == 2+1 {
-                       // single character escape sequence
-                       switch s[2] {
-                       case '\'':
-                               val = '\''
-                       case '"':
-                               val = '"'
-                       case '\\':
-                               val = '\\'
-                       case 'a':
-                               val = '\a'
-                       case 'b':
-                               val = '\b'
-                       case 'n':
-                               val = '\n'
-                       case 'r':
-                               val = '\r'
-                       case 't':
-                               val = '\t'
-                       case 'v':
-                               val = '\v'
-                       default:
-                               errorf("invalid escape %v", s[1:3])
-                       }
-               } else if s[2] == 'u' && len(s) == 2+1+4 { // \unnnn sequence
-                       val = 0
-                       s = s[3:]
-                       for s != "" {
-                               c := int(s[0])
-                               switch {
-                               case c >= '0' && c <= '9':
-                                       c -= '0'
-                               case c >= 'a' && c <= 'f':
-                                       c -= 'a' - 10
-                               case c >= 'A' && c <= 'F':
-                                       c -= 'A' - 10
+       if s[0] == ' ' {
+               s = s[1:]
+               r, size := utf8.DecodeRuneInString(s)
+               if r == utf8.RuneError && size == 1 {
+                       errorf("invalid UTF-8 sequence %q", s)
+               }
+               val = int(r)
+               if val == '\\' { // escape sequence
+                       switch {
+                       case len(s) == 2:
+                               // single character escape sequence
+                               switch s[1] {
+                               case '\'':
+                                       val = '\''
+                               case '"':
+                                       val = '"'
+                               case '\\':
+                                       val = '\\'
+                               case 'a':
+                                       val = '\a'
+                               case 'b':
+                                       val = '\b'
+                               case 'f':
+                                       val = '\f'
+                               case 'n':
+                                       val = '\n'
+                               case 'r':
+                                       val = '\r'
+                               case 't':
+                                       val = '\t'
+                               case 'v':
+                                       val = '\v'
                                default:
-                                       errorf("illegal \\unnnn construction")
+                                       errorf("invalid escape %s", s)
                                }
-                               val = val*16 + c
-                               s = s[1:]
-                       }
-                       if val == 0 {
-                               errorf("'\\u0000' is illegal")
+                       case s[1] == 'u' && len(s) == 2+4, // \unnnn sequence
+                               s[1] == 'U' && len(s) == 2+8: // \Unnnnnnnn sequence
+                               val = 0
+                               s = s[2:]
+                               for s != "" {
+                                       c := int(s[0])
+                                       switch {
+                                       case c >= '0' && c <= '9':
+                                               c -= '0'
+                                       case c >= 'a' && c <= 'f':
+                                               c -= 'a' - 10
+                                       case c >= 'A' && c <= 'F':
+                                               c -= 'A' - 10
+                                       default:
+                                               errorf(`illegal \u or \U construction`)
+                                       }
+                                       val = val*16 + c
+                                       s = s[1:]
+                               }
+                       default:
+                               errorf("invalid escape %s", s)
                        }
-               } else {
-                       errorf("unknown escape")
+               }
+               if val == 0 {
+                       errorf("token value 0 is illegal")
                }
        } else {
                val = extval