From bfebf9ea8071683af608b8bf291fc7d8365d501b Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Fri, 26 Sep 2014 17:03:31 -0400 Subject: [PATCH] cmd/yacc: fix parsing of character tokens From issue 7967 I learned: 1) yacc accepts either 'x' or "x" to mean token value 0x78 2) yacc also accepts 'xyz' and "xyz" to mean token value 0x78 Use strconv.Unquote to simplify the handling of quoted strings and check that each has only one rune. Although this does clean things up, it causes 'x' and "x" to be treated as different internally (now they are stored as `'x'` and `"x"`; before they were both ` x`). Grammars that use both interchangeably will now die with an error similar to the one from issue 7967: yacc bug -- cannot have 2 different Ts with same value "+" and '+' The echoing of the quotes should make clear what is going on. The other semantic change caused by using strconv.Unquote is that '\"' and "\'" are no longer valid. Like in Go, they must be spelled without the backslash: '"' and "'". On the other hand, now yacc and Go agree about what character and string literals mean. 
LGTM=r R=r CC=golang-codereviews https://golang.org/cl/149110043 --- src/cmd/yacc/yacc.go | 76 ++++++++------------------------------------ 1 file changed, 13 insertions(+), 63 deletions(-) diff --git a/src/cmd/yacc/yacc.go b/src/cmd/yacc/yacc.go index c53403266e..0761811cf4 100644 --- a/src/cmd/yacc/yacc.go +++ b/src/cmd/yacc/yacc.go @@ -52,9 +52,9 @@ import ( "go/format" "io/ioutil" "os" + "strconv" "strings" "unicode" - "unicode/utf8" ) // the following are adjustable @@ -756,64 +756,16 @@ func defin(nt int, s string) int { // establish value for token // single character literal - if s[0] == ' ' { - s = s[1:] - r, size := utf8.DecodeRuneInString(s) - if r == utf8.RuneError && size == 1 { - errorf("invalid UTF-8 sequence %q", s) - } - val = int(r) - if val == '\\' { // escape sequence - switch { - case len(s) == 2: - // single character escape sequence - switch s[1] { - case '\'': - val = '\'' - case '"': - val = '"' - case '\\': - val = '\\' - case 'a': - val = '\a' - case 'b': - val = '\b' - case 'f': - val = '\f' - case 'n': - val = '\n' - case 'r': - val = '\r' - case 't': - val = '\t' - case 'v': - val = '\v' - default: - errorf("invalid escape %s", s) - } - case s[1] == 'u' && len(s) == 2+4, // \unnnn sequence - s[1] == 'U' && len(s) == 2+8: // \Unnnnnnnn sequence - val = 0 - s = s[2:] - for s != "" { - c := int(s[0]) - switch { - case c >= '0' && c <= '9': - c -= '0' - case c >= 'a' && c <= 'f': - c -= 'a' - 10 - case c >= 'A' && c <= 'F': - c -= 'A' - 10 - default: - errorf(`illegal \u or \U construction`) - } - val = val*16 + c - s = s[1:] - } - default: - errorf("invalid escape %s", s) - } + if s[0] == '\'' || s[0] == '"' { + q, err := strconv.Unquote(s) + if err != nil { + errorf("invalid token: %s", err) + } + rq := []rune(q) + if len(rq) != 1 { + errorf("character token too long: %s", s) } + val = int(rq[0]) if val == 0 { errorf("token value 0 is illegal") } @@ -896,7 +848,7 @@ func gettok() int { case '"', '\'': match = c - tokname = " " + tokname 
= string(c) for { c = getrune(finput) if c == '\n' || c == EOF { @@ -909,6 +861,7 @@ func gettok() int { if tokflag { fmt.Printf(">>> IDENTIFIER \"%v\" %v\n", tokname, lineno) } + tokname += string(c) return IDENTIFIER } tokname += string(c) @@ -1029,7 +982,7 @@ func fdtype(t int) int { } func chfind(t int, s string) int { - if s[0] == ' ' { + if s[0] == '"' || s[0] == '\'' { t = 0 } for i := 0; i <= ntokens; i++ { @@ -1516,9 +1469,6 @@ func symnam(i int) string { } else { s = tokset[i].name } - if s[0] == ' ' { - s = s[1:] - } return s } -- 2.48.1