encoding/json: avoid work when unquoting strings, take 2

author Daniel Martí <mvdan@mvdan.cc>

Wed, 21 Aug 2019 16:22:24 +0000 (18:22 +0200)

committer Daniel Martí <mvdan@mvdan.cc>

Thu, 31 Oct 2019 12:02:57 +0000 (12:02 +0000)
author Daniel Martí <mvdan@mvdan.cc>
Wed, 21 Aug 2019 16:22:24 +0000 (18:22 +0200)
committer Daniel Martí <mvdan@mvdan.cc>
Thu, 31 Oct 2019 12:02:57 +0000 (12:02 +0000)
diff --git a/src/encoding/json/decode.go b/src/encoding/json/decode.go

index 86d8a69db7e6d76aa40ca9d4b307ea2f43e909b7..b43484692e18565d453fa7c8b364435bf7e56803 100644 (file)
--- a/src/encoding/json/decode.go
+++ b/src/encoding/json/decode.go
@@ -213,6 +213,9 @@ type decodeState struct {
         savedError            error
         useNumber             bool
         disallowUnknownFields bool
+       // safeUnquote is the number of current string literal bytes that don't
+       // need to be unquoted. When negative, no bytes need unquoting.
+       safeUnquote int
  }
  
  // readIndex returns the position of the last byte read.
@@ -314,13 +317,27 @@ func (d *decodeState) rescanLiteral() {
  Switch:
         switch data[i-1] {
         case '"': // string
+               // safeUnquote is initialized at -1, which means that all bytes
+               // checked so far can be unquoted at a later time with no work
+               // at all. When reaching the closing '"', if safeUnquote is
+               // still -1, all bytes can be unquoted with no work. Otherwise,
+               // only those bytes up until the first '\\' or non-ascii rune
+               // can be safely unquoted.
+               safeUnquote := -1
                 for ; i < len(data); i++ {
-                       switch data[i] {
-                       case '\\':
+                       if c := data[i]; c == '\\' {
+                               if safeUnquote < 0 { // first unsafe byte
+                                       safeUnquote = int(i - d.off)
+                               }
                                 i++ // escaped char
-                       case '"':
+                       } else if c == '"' {
+                               d.safeUnquote = safeUnquote
                                 i++ // tokenize the closing quote too
                                 break Switch
+                       } else if c >= utf8.RuneSelf {
+                               if safeUnquote < 0 { // first unsafe byte
+                                       safeUnquote = int(i - d.off)
+                               }
                         }
                 }
         case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-': // number
@@ -674,7 +691,7 @@ func (d *decodeState) object(v reflect.Value) error {
                 start := d.readIndex()
                 d.rescanLiteral()
                 item := d.data[start:d.readIndex()]
-               key, ok := unquoteBytes(item)
+               key, ok := d.unquoteBytes(item)
                 if !ok {
                         panic(phasePanicMsg)
                 }
@@ -875,7 +892,7 @@ func (d *decodeState) literalStore(item []byte, v reflect.Value, fromQuoted bool
                         d.saveError(&UnmarshalTypeError{Value: val, Type: v.Type(), Offset: int64(d.readIndex())})
                         return nil
                 }
-               s, ok := unquoteBytes(item)
+               s, ok := d.unquoteBytes(item)
                 if !ok {
                         if fromQuoted {
                                 return fmt.Errorf("json: invalid use of ,string struct tag, trying to unmarshal %q into %v", item, v.Type())
@@ -926,7 +943,7 @@ func (d *decodeState) literalStore(item []byte, v reflect.Value, fromQuoted bool
                 }
  
         case '"': // string
-               s, ok := unquoteBytes(item)
+               s, ok := d.unquoteBytes(item)
                 if !ok {
                         if fromQuoted {
                                 return fmt.Errorf("json: invalid use of ,string struct tag, trying to unmarshal %q into %v", item, v.Type())
@@ -1086,7 +1103,7 @@ func (d *decodeState) objectInterface() map[string]interface{} {
                 start := d.readIndex()
                 d.rescanLiteral()
                 item := d.data[start:d.readIndex()]
-               key, ok := unquote(item)
+               key, ok := d.unquote(item)
                 if !ok {
                         panic(phasePanicMsg)
                 }
@@ -1135,7 +1152,7 @@ func (d *decodeState) literalInterface() interface{} {
                 return c == 't'
  
         case '"': // string
-               s, ok := unquote(item)
+               s, ok := d.unquote(item)
                 if !ok {
                         panic(phasePanicMsg)
                 }
@@ -1178,38 +1195,26 @@ func getu4(s []byte) rune {
  
  // unquote converts a quoted JSON string literal s into an actual string t.
  // The rules are different than for Go, so cannot use strconv.Unquote.
-func unquote(s []byte) (t string, ok bool) {
-       s, ok = unquoteBytes(s)
+// The first byte in s must be '"'.
+func (d *decodeState) unquote(s []byte) (t string, ok bool) {
+       s, ok = d.unquoteBytes(s)
         t = string(s)
         return
  }
  
-func unquoteBytes(s []byte) (t []byte, ok bool) {
-       if len(s) < 2 || s[0] != '"' || s[len(s)-1] != '"' {
+func (d *decodeState) unquoteBytes(s []byte) (t []byte, ok bool) {
+       // We already know that s[0] == '"'. However, we don't know that the
+       // closing quote exists in all cases, such as when the string is nested
+       // via the ",string" option.
+       if len(s) < 2 || s[len(s)-1] != '"' {
                 return
         }
         s = s[1 : len(s)-1]
  
-       // Check for unusual characters. If there are none,
-       // then no unquoting is needed, so return a slice of the
-       // original bytes.
-       r := 0
-       for r < len(s) {
-               c := s[r]
-               if c == '\\' || c == '"' || c < ' ' {
-                       break
-               }
-               if c < utf8.RuneSelf {
-                       r++
-                       continue
-               }
-               rr, size := utf8.DecodeRune(s[r:])
-               if rr == utf8.RuneError && size == 1 {
-                       break
-               }
-               r += size
-       }
-       if r == len(s) {
+       // If there are no unusual characters, no unquoting is needed, so return
+       // a slice of the original bytes.
+       r := d.safeUnquote
+       if r == -1 {
                 return s, true
         }
author	Daniel Martí <mvdan@mvdan.cc>
	Wed, 21 Aug 2019 16:22:24 +0000 (18:22 +0200)
committer	Daniel Martí <mvdan@mvdan.cc>
	Thu, 31 Oct 2019 12:02:57 +0000 (12:02 +0000)