encoding/json: unify encodeState.string and encodeState.stringBytes

author Joe Tsai <joetsai@digital-static.net>

Mon, 20 Feb 2023 01:37:02 +0000 (17:37 -0800)

committer Gopher Robot <gobot@golang.org>

Fri, 24 Feb 2023 19:00:16 +0000 (19:00 +0000)
author Joe Tsai <joetsai@digital-static.net>
Mon, 20 Feb 2023 01:37:02 +0000 (17:37 -0800)
committer Gopher Robot <gobot@golang.org>
Fri, 24 Feb 2023 19:00:16 +0000 (19:00 +0000)
diff --git a/src/encoding/json/encode.go b/src/encoding/json/encode.go

index d2f752a4f8695fe3b855a94d573165d4e6f800e7..f7cfb2b820584b590a9f396186a0506049df185e 100644 (file)
--- a/src/encoding/json/encode.go
+++ b/src/encoding/json/encode.go
@@ -295,6 +295,10 @@ type encodeState struct {
         ptrSeen  map[any]struct{}
  }
  
+func (e *encodeState) AvailableBuffer() []byte {
+       return availableBuffer(&e.Buffer)
+}
+
  const startDetectingCyclesAfter = 1000
  
  var encodeStatePool sync.Pool
@@ -519,7 +523,7 @@ func textMarshalerEncoder(e *encodeState, v reflect.Value, opts encOpts) {
         if err != nil {
                 e.error(&MarshalerError{v.Type(), err, "MarshalText"})
         }
-       e.stringBytes(b, opts.escapeHTML)
+       e.Write(appendString(e.AvailableBuffer(), b, opts.escapeHTML))
  }
  
  func addrTextMarshalerEncoder(e *encodeState, v reflect.Value, opts encOpts) {
@@ -533,7 +537,7 @@ func addrTextMarshalerEncoder(e *encodeState, v reflect.Value, opts encOpts) {
         if err != nil {
                 e.error(&MarshalerError{v.Type(), err, "MarshalText"})
         }
-       e.stringBytes(b, opts.escapeHTML)
+       e.Write(appendString(e.AvailableBuffer(), b, opts.escapeHTML))
  }
  
  func boolEncoder(e *encodeState, v reflect.Value, opts encOpts) {
@@ -639,14 +643,10 @@ func stringEncoder(e *encodeState, v reflect.Value, opts encOpts) {
                 return
         }
         if opts.quoted {
-               e2 := newEncodeState()
-               // Since we encode the string twice, we only need to escape HTML
-               // the first time.
-               e2.string(v.String(), opts.escapeHTML)
-               e.stringBytes(e2.Bytes(), false)
-               encodeStatePool.Put(e2)
+               b := appendString(nil, v.String(), opts.escapeHTML)
+               e.Write(appendString(e.AvailableBuffer(), b, false)) // no need to escape again since it is already escaped
         } else {
-               e.string(v.String(), opts.escapeHTML)
+               e.Write(appendString(e.AvailableBuffer(), v.String(), opts.escapeHTML))
         }
  }
  
@@ -811,7 +811,7 @@ func (me mapEncoder) encode(e *encodeState, v reflect.Value, opts encOpts) {
                 if i > 0 {
                         e.WriteByte(',')
                 }
-               e.string(kv.ks, opts.escapeHTML)
+               e.Write(appendString(e.AvailableBuffer(), kv.ks, opts.escapeHTML))
                 e.WriteByte(':')
                 me.elemEnc(e, kv.v, opts)
         }
@@ -1029,121 +1029,49 @@ func (w *reflectWithString) resolve() error {
         panic("unexpected map key type")
  }
  
-// NOTE: keep in sync with stringBytes below.
-func (e *encodeState) string(s string, escapeHTML bool) {
-       e.WriteByte('"')
+func appendString[Bytes []byte | string](dst []byte, src Bytes, escapeHTML bool) []byte {
+       dst = append(dst, '"')
         start := 0
-       for i := 0; i < len(s); {
-               if b := s[i]; b < utf8.RuneSelf {
+       for i := 0; i < len(src); {
+               if b := src[i]; b < utf8.RuneSelf {
                         if htmlSafeSet[b] || (!escapeHTML && safeSet[b]) {
                                 i++
                                 continue
                         }
-                       if start < i {
-                               e.WriteString(s[start:i])
-                       }
-                       e.WriteByte('\\')
+                       dst = append(dst, src[start:i]...)
                         switch b {
                         case '\\', '"':
-                               e.WriteByte(b)
+                               dst = append(dst, '\\', b)
                         case '\n':
-                               e.WriteByte('n')
+                               dst = append(dst, '\\', 'n')
                         case '\r':
-                               e.WriteByte('r')
+                               dst = append(dst, '\\', 'r')
                         case '\t':
-                               e.WriteByte('t')
+                               dst = append(dst, '\\', 't')
                         default:
                                 // This encodes bytes < 0x20 except for \t, \n and \r.
                                 // If escapeHTML is set, it also escapes <, >, and &
                                 // because they can lead to security holes when
                                 // user-controlled strings are rendered into JSON
                                 // and served to some browsers.
-                               e.WriteString(`u00`)
-                               e.WriteByte(hex[b>>4])
-                               e.WriteByte(hex[b&0xF])
+                               dst = append(dst, '\\', 'u', '0', '0', hex[b>>4], hex[b&0xF])
                         }
                         i++
                         start = i
                         continue
                 }
-               c, size := utf8.DecodeRuneInString(s[i:])
-               if c == utf8.RuneError && size == 1 {
-                       if start < i {
-                               e.WriteString(s[start:i])
-                       }
-                       e.WriteString(`\ufffd`)
-                       i += size
-                       start = i
-                       continue
+               // TODO(https://go.dev/issue/56948): Use generic utf8 functionality.
+               // For now, cast only a small portion of byte slices to a string
+               // so that it can be stack allocated. This slows down []byte slightly
+               // due to the extra copy, but keeps string performance roughly the same.
+               n := len(src) - i
+               if n > utf8.UTFMax {
+                       n = utf8.UTFMax
                 }
-               // U+2028 is LINE SEPARATOR.
-               // U+2029 is PARAGRAPH SEPARATOR.
-               // They are both technically valid characters in JSON strings,
-               // but don't work in JSONP, which has to be evaluated as JavaScript,
-               // and can lead to security holes there. It is valid JSON to
-               // escape them, so we do so unconditionally.
-               // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
-               if c == '\u2028' || c == '\u2029' {
-                       if start < i {
-                               e.WriteString(s[start:i])
-                       }
-                       e.WriteString(`\u202`)
-                       e.WriteByte(hex[c&0xF])
-                       i += size
-                       start = i
-                       continue
-               }
-               i += size
-       }
-       if start < len(s) {
-               e.WriteString(s[start:])
-       }
-       e.WriteByte('"')
-}
-
-// NOTE: keep in sync with string above.
-func (e *encodeState) stringBytes(s []byte, escapeHTML bool) {
-       e.WriteByte('"')
-       start := 0
-       for i := 0; i < len(s); {
-               if b := s[i]; b < utf8.RuneSelf {
-                       if htmlSafeSet[b] || (!escapeHTML && safeSet[b]) {
-                               i++
-                               continue
-                       }
-                       if start < i {
-                               e.Write(s[start:i])
-                       }
-                       e.WriteByte('\\')
-                       switch b {
-                       case '\\', '"':
-                               e.WriteByte(b)
-                       case '\n':
-                               e.WriteByte('n')
-                       case '\r':
-                               e.WriteByte('r')
-                       case '\t':
-                               e.WriteByte('t')
-                       default:
-                               // This encodes bytes < 0x20 except for \t, \n and \r.
-                               // If escapeHTML is set, it also escapes <, >, and &
-                               // because they can lead to security holes when
-                               // user-controlled strings are rendered into JSON
-                               // and served to some browsers.
-                               e.WriteString(`u00`)
-                               e.WriteByte(hex[b>>4])
-                               e.WriteByte(hex[b&0xF])
-                       }
-                       i++
-                       start = i
-                       continue
-               }
-               c, size := utf8.DecodeRune(s[i:])
+               c, size := utf8.DecodeRuneInString(string(src[i : i+n]))
                 if c == utf8.RuneError && size == 1 {
-                       if start < i {
-                               e.Write(s[start:i])
-                       }
-                       e.WriteString(`\ufffd`)
+                       dst = append(dst, src[start:i]...)
+                       dst = append(dst, `\ufffd`...)
                         i += size
                         start = i
                         continue
@@ -1156,21 +1084,17 @@ func (e *encodeState) stringBytes(s []byte, escapeHTML bool) {
                 // escape them, so we do so unconditionally.
                 // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
                 if c == '\u2028' || c == '\u2029' {
-                       if start < i {
-                               e.Write(s[start:i])
-                       }
-                       e.WriteString(`\u202`)
-                       e.WriteByte(hex[c&0xF])
+                       dst = append(dst, src[start:i]...)
+                       dst = append(dst, '\\', 'u', '2', '0', '2', hex[c&0xF])
                         i += size
                         start = i
                         continue
                 }
                 i += size
         }
-       if start < len(s) {
-               e.Write(s[start:])
-       }
-       e.WriteByte('"')
+       dst = append(dst, src[start:]...)
+       dst = append(dst, '"')
+       return dst
  }
  
  // A field represents a single field found in a struct.
diff --git a/src/encoding/json/encode_test.go b/src/encoding/json/encode_test.go

index c1b9ed2676dfa2b7ad51654a689e3d6f45f47150..d027972d8a0a777258935f835f7d25d3a6179f66 100644 (file)
--- a/src/encoding/json/encode_test.go
+++ b/src/encoding/json/encode_test.go
@@ -15,7 +15,6 @@ import (
         "runtime/debug"
         "strconv"
         "testing"
-       "unicode"
  )
  
  type Optionals struct {
@@ -701,54 +700,6 @@ func TestDuplicatedFieldDisappears(t *testing.T) {
         }
  }
  
-func TestStringBytes(t *testing.T) {
-       t.Parallel()
-       // Test that encodeState.stringBytes and encodeState.string use the same encoding.
-       var r []rune
-       for i := '\u0000'; i <= unicode.MaxRune; i++ {
-               if testing.Short() && i > 1000 {
-                       i = unicode.MaxRune
-               }
-               r = append(r, i)
-       }
-       s := string(r) + "\xff\xff\xffhello" // some invalid UTF-8 too
-
-       for _, escapeHTML := range []bool{true, false} {
-               es := &encodeState{}
-               es.string(s, escapeHTML)
-
-               esBytes := &encodeState{}
-               esBytes.stringBytes([]byte(s), escapeHTML)
-
-               enc := es.Buffer.String()
-               encBytes := esBytes.Buffer.String()
-               if enc != encBytes {
-                       i := 0
-                       for i < len(enc) && i < len(encBytes) && enc[i] == encBytes[i] {
-                               i++
-                       }
-                       enc = enc[i:]
-                       encBytes = encBytes[i:]
-                       i = 0
-                       for i < len(enc) && i < len(encBytes) && enc[len(enc)-i-1] == encBytes[len(encBytes)-i-1] {
-                               i++
-                       }
-                       enc = enc[:len(enc)-i]
-                       encBytes = encBytes[:len(encBytes)-i]
-
-                       if len(enc) > 20 {
-                               enc = enc[:20] + "..."
-                       }
-                       if len(encBytes) > 20 {
-                               encBytes = encBytes[:20] + "..."
-                       }
-
-                       t.Errorf("with escapeHTML=%t, encodings differ at %#q vs %#q",
-                               escapeHTML, enc, encBytes)
-               }
-       }
-}
-
  func TestIssue10281(t *testing.T) {
         type Foo struct {
                 N Number
author	Joe Tsai <joetsai@digital-static.net>
	Mon, 20 Feb 2023 01:37:02 +0000 (17:37 -0800)
committer	Gopher Robot <gobot@golang.org>
	Fri, 24 Feb 2023 19:00:16 +0000 (19:00 +0000)
src/encoding/json/encode.go		patch \| blob \| history
src/encoding/json/encode_test.go		patch \| blob \| history