From e2f413402527505144beea443078649380e0c545 Mon Sep 17 00:00:00 2001
From: Joe Tsai <joetsai@digital-static.net>
Date: Sun, 19 Feb 2023 17:37:02 -0800
Subject: [PATCH] encoding/json: unify encodeState.string and
 encodeState.stringBytes
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This is part of the effort to reduce direct reliance on bytes.Buffer
so that we can use a buffer with better pooling characteristics.

Unify these two methods as a single version that uses generics
to reduce duplicated logic. Unfortunately, we lack a generic
version of utf8.DecodeRune (see #56948), so we cast []byte to string.
The []byte variant is slightly slower for multi-byte unicode since
casting results in a stack-allocated copy operation.
Fortunately, this code path is used only for TextMarshalers.
We can also delete TestStringBytes, which exists to ensure
that the two duplicate implementations remain in sync.

Performance:

    name              old time/op    new time/op    delta
    CodeEncoder          399Âµs Â± 2%     409Âµs Â± 2%   +2.59%  (p=0.000 n=9+9)
    CodeEncoderError     450Âµs Â± 1%     451Âµs Â± 2%     ~     (p=0.684 n=10+10)
    CodeMarshal          553Âµs Â± 2%     562Âµs Â± 3%     ~     (p=0.075 n=10+10)
    CodeMarshalError     733Âµs Â± 3%     737Âµs Â± 2%     ~     (p=0.400 n=9+10)
    EncodeMarshaler     24.9ns Â±12%    24.1ns Â±13%     ~     (p=0.190 n=10+10)
    EncoderEncode       12.3ns Â± 3%    14.7ns Â±20%     ~     (p=0.315 n=8+10)

    name              old speed      new speed      delta
    CodeEncoder       4.87GB/s Â± 2%  4.74GB/s Â± 2%   -2.53%  (p=0.000 n=9+9)
    CodeEncoderError  4.31GB/s Â± 1%  4.30GB/s Â± 2%     ~     (p=0.684 n=10+10)
    CodeMarshal       3.51GB/s Â± 2%  3.46GB/s Â± 3%     ~     (p=0.075 n=10+10)
    CodeMarshalError  2.65GB/s Â± 3%  2.63GB/s Â± 2%     ~     (p=0.400 n=9+10)

    name              old alloc/op   new alloc/op   delta
    CodeEncoder          327B Â±347%     447B Â±232%  +36.93%  (p=0.034 n=9+10)
    CodeEncoderError      142B Â± 1%      143B Â± 0%     ~     (p=1.000 n=8+7)
    CodeMarshal         1.96MB Â± 2%    1.96MB Â± 2%     ~     (p=0.468 n=10+10)
    CodeMarshalError    2.04MB Â± 3%    2.03MB Â± 1%     ~     (p=0.971 n=10+10)
    EncodeMarshaler      4.00B Â± 0%     4.00B Â± 0%     ~     (all equal)
    EncoderEncode        0.00B          0.00B          ~     (all equal)

    name              old allocs/op  new allocs/op  delta
    CodeEncoder           0.00           0.00          ~     (all equal)
    CodeEncoderError      4.00 Â± 0%      4.00 Â± 0%     ~     (all equal)
    CodeMarshal           1.00 Â± 0%      1.00 Â± 0%     ~     (all equal)
    CodeMarshalError      6.00 Â± 0%      6.00 Â± 0%     ~     (all equal)
    EncodeMarshaler       1.00 Â± 0%      1.00 Â± 0%     ~     (all equal)
    EncoderEncode         0.00           0.00          ~     (all equal)

There is a very slight performance degradation for CodeEncoder
due to an increase in allocation sizes. However, the number of allocations
did not change. This is likely due to remote effects of the growth rate
differences between bytes.Buffer and the builtin append function.
We shouldn't overly rely on the growth rate of bytes.Buffer anyways
since that is subject to possibly change in #51462.
As the benchtime increases, the alloc/op goes down indicating
that the amortized memory cost is fixed.

Updates #27735

Change-Id: Ie35e480e292fe082d7986e0a4d81212c1d4202b3
Reviewed-on: https://go-review.googlesource.com/c/go/+/469556
Run-TryBot: Joseph Tsai <joetsai@digital-static.net>
Reviewed-by: Bryan Mills <bcmills@google.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Daniel MartÃ­ <mvdan@mvdan.cc>
Auto-Submit: Joseph Tsai <joetsai@digital-static.net>
---
 src/encoding/json/encode.go      | 146 ++++++++-----------------------
 src/encoding/json/encode_test.go |  49 -----------
 2 files changed, 35 insertions(+), 160 deletions(-)

diff --git a/src/encoding/json/encode.go b/src/encoding/json/encode.go
index d2f752a4f8..f7cfb2b820 100644
--- a/src/encoding/json/encode.go
+++ b/src/encoding/json/encode.go
@@ -295,6 +295,10 @@ type encodeState struct {
 	ptrSeen  map[any]struct{}
 }
 
+func (e *encodeState) AvailableBuffer() []byte {
+	return availableBuffer(&e.Buffer)
+}
+
 const startDetectingCyclesAfter = 1000
 
 var encodeStatePool sync.Pool
@@ -519,7 +523,7 @@ func textMarshalerEncoder(e *encodeState, v reflect.Value, opts encOpts) {
 	if err != nil {
 		e.error(&MarshalerError{v.Type(), err, "MarshalText"})
 	}
-	e.stringBytes(b, opts.escapeHTML)
+	e.Write(appendString(e.AvailableBuffer(), b, opts.escapeHTML))
 }
 
 func addrTextMarshalerEncoder(e *encodeState, v reflect.Value, opts encOpts) {
@@ -533,7 +537,7 @@ func addrTextMarshalerEncoder(e *encodeState, v reflect.Value, opts encOpts) {
 	if err != nil {
 		e.error(&MarshalerError{v.Type(), err, "MarshalText"})
 	}
-	e.stringBytes(b, opts.escapeHTML)
+	e.Write(appendString(e.AvailableBuffer(), b, opts.escapeHTML))
 }
 
 func boolEncoder(e *encodeState, v reflect.Value, opts encOpts) {
@@ -639,14 +643,10 @@ func stringEncoder(e *encodeState, v reflect.Value, opts encOpts) {
 		return
 	}
 	if opts.quoted {
-		e2 := newEncodeState()
-		// Since we encode the string twice, we only need to escape HTML
-		// the first time.
-		e2.string(v.String(), opts.escapeHTML)
-		e.stringBytes(e2.Bytes(), false)
-		encodeStatePool.Put(e2)
+		b := appendString(nil, v.String(), opts.escapeHTML)
+		e.Write(appendString(e.AvailableBuffer(), b, false)) // no need to escape again since it is already escaped
 	} else {
-		e.string(v.String(), opts.escapeHTML)
+		e.Write(appendString(e.AvailableBuffer(), v.String(), opts.escapeHTML))
 	}
 }
 
@@ -811,7 +811,7 @@ func (me mapEncoder) encode(e *encodeState, v reflect.Value, opts encOpts) {
 		if i > 0 {
 			e.WriteByte(',')
 		}
-		e.string(kv.ks, opts.escapeHTML)
+		e.Write(appendString(e.AvailableBuffer(), kv.ks, opts.escapeHTML))
 		e.WriteByte(':')
 		me.elemEnc(e, kv.v, opts)
 	}
@@ -1029,121 +1029,49 @@ func (w *reflectWithString) resolve() error {
 	panic("unexpected map key type")
 }
 
-// NOTE: keep in sync with stringBytes below.
-func (e *encodeState) string(s string, escapeHTML bool) {
-	e.WriteByte('"')
+func appendString[Bytes []byte | string](dst []byte, src Bytes, escapeHTML bool) []byte {
+	dst = append(dst, '"')
 	start := 0
-	for i := 0; i < len(s); {
-		if b := s[i]; b < utf8.RuneSelf {
+	for i := 0; i < len(src); {
+		if b := src[i]; b < utf8.RuneSelf {
 			if htmlSafeSet[b] || (!escapeHTML && safeSet[b]) {
 				i++
 				continue
 			}
-			if start < i {
-				e.WriteString(s[start:i])
-			}
-			e.WriteByte('\\')
+			dst = append(dst, src[start:i]...)
 			switch b {
 			case '\\', '"':
-				e.WriteByte(b)
+				dst = append(dst, '\\', b)
 			case '\n':
-				e.WriteByte('n')
+				dst = append(dst, '\\', 'n')
 			case '\r':
-				e.WriteByte('r')
+				dst = append(dst, '\\', 'r')
 			case '\t':
-				e.WriteByte('t')
+				dst = append(dst, '\\', 't')
 			default:
 				// This encodes bytes < 0x20 except for \t, \n and \r.
 				// If escapeHTML is set, it also escapes <, >, and &
 				// because they can lead to security holes when
 				// user-controlled strings are rendered into JSON
 				// and served to some browsers.
-				e.WriteString(`u00`)
-				e.WriteByte(hex[b>>4])
-				e.WriteByte(hex[b&0xF])
+				dst = append(dst, '\\', 'u', '0', '0', hex[b>>4], hex[b&0xF])
 			}
 			i++
 			start = i
 			continue
 		}
-		c, size := utf8.DecodeRuneInString(s[i:])
-		if c == utf8.RuneError && size == 1 {
-			if start < i {
-				e.WriteString(s[start:i])
-			}
-			e.WriteString(`\ufffd`)
-			i += size
-			start = i
-			continue
+		// TODO(https://go.dev/issue/56948): Use generic utf8 functionality.
+		// For now, cast only a small portion of byte slices to a string
+		// so that it can be stack allocated. This slows down []byte slightly
+		// due to the extra copy, but keeps string performance roughly the same.
+		n := len(src) - i
+		if n > utf8.UTFMax {
+			n = utf8.UTFMax
 		}
-		// U+2028 is LINE SEPARATOR.
-		// U+2029 is PARAGRAPH SEPARATOR.
-		// They are both technically valid characters in JSON strings,
-		// but don't work in JSONP, which has to be evaluated as JavaScript,
-		// and can lead to security holes there. It is valid JSON to
-		// escape them, so we do so unconditionally.
-		// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
-		if c == '\u2028' || c == '\u2029' {
-			if start < i {
-				e.WriteString(s[start:i])
-			}
-			e.WriteString(`\u202`)
-			e.WriteByte(hex[c&0xF])
-			i += size
-			start = i
-			continue
-		}
-		i += size
-	}
-	if start < len(s) {
-		e.WriteString(s[start:])
-	}
-	e.WriteByte('"')
-}
-
-// NOTE: keep in sync with string above.
-func (e *encodeState) stringBytes(s []byte, escapeHTML bool) {
-	e.WriteByte('"')
-	start := 0
-	for i := 0; i < len(s); {
-		if b := s[i]; b < utf8.RuneSelf {
-			if htmlSafeSet[b] || (!escapeHTML && safeSet[b]) {
-				i++
-				continue
-			}
-			if start < i {
-				e.Write(s[start:i])
-			}
-			e.WriteByte('\\')
-			switch b {
-			case '\\', '"':
-				e.WriteByte(b)
-			case '\n':
-				e.WriteByte('n')
-			case '\r':
-				e.WriteByte('r')
-			case '\t':
-				e.WriteByte('t')
-			default:
-				// This encodes bytes < 0x20 except for \t, \n and \r.
-				// If escapeHTML is set, it also escapes <, >, and &
-				// because they can lead to security holes when
-				// user-controlled strings are rendered into JSON
-				// and served to some browsers.
-				e.WriteString(`u00`)
-				e.WriteByte(hex[b>>4])
-				e.WriteByte(hex[b&0xF])
-			}
-			i++
-			start = i
-			continue
-		}
-		c, size := utf8.DecodeRune(s[i:])
+		c, size := utf8.DecodeRuneInString(string(src[i : i+n]))
 		if c == utf8.RuneError && size == 1 {
-			if start < i {
-				e.Write(s[start:i])
-			}
-			e.WriteString(`\ufffd`)
+			dst = append(dst, src[start:i]...)
+			dst = append(dst, `\ufffd`...)
 			i += size
 			start = i
 			continue
@@ -1156,21 +1084,17 @@ func (e *encodeState) stringBytes(s []byte, escapeHTML bool) {
 		// escape them, so we do so unconditionally.
 		// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
 		if c == '\u2028' || c == '\u2029' {
-			if start < i {
-				e.Write(s[start:i])
-			}
-			e.WriteString(`\u202`)
-			e.WriteByte(hex[c&0xF])
+			dst = append(dst, src[start:i]...)
+			dst = append(dst, '\\', 'u', '2', '0', '2', hex[c&0xF])
 			i += size
 			start = i
 			continue
 		}
 		i += size
 	}
-	if start < len(s) {
-		e.Write(s[start:])
-	}
-	e.WriteByte('"')
+	dst = append(dst, src[start:]...)
+	dst = append(dst, '"')
+	return dst
 }
 
 // A field represents a single field found in a struct.
diff --git a/src/encoding/json/encode_test.go b/src/encoding/json/encode_test.go
index c1b9ed2676..d027972d8a 100644
--- a/src/encoding/json/encode_test.go
+++ b/src/encoding/json/encode_test.go
@@ -15,7 +15,6 @@ import (
 	"runtime/debug"
 	"strconv"
 	"testing"
-	"unicode"
 )
 
 type Optionals struct {
@@ -701,54 +700,6 @@ func TestDuplicatedFieldDisappears(t *testing.T) {
 	}
 }
 
-func TestStringBytes(t *testing.T) {
-	t.Parallel()
-	// Test that encodeState.stringBytes and encodeState.string use the same encoding.
-	var r []rune
-	for i := '\u0000'; i <= unicode.MaxRune; i++ {
-		if testing.Short() && i > 1000 {
-			i = unicode.MaxRune
-		}
-		r = append(r, i)
-	}
-	s := string(r) + "\xff\xff\xffhello" // some invalid UTF-8 too
-
-	for _, escapeHTML := range []bool{true, false} {
-		es := &encodeState{}
-		es.string(s, escapeHTML)
-
-		esBytes := &encodeState{}
-		esBytes.stringBytes([]byte(s), escapeHTML)
-
-		enc := es.Buffer.String()
-		encBytes := esBytes.Buffer.String()
-		if enc != encBytes {
-			i := 0
-			for i < len(enc) && i < len(encBytes) && enc[i] == encBytes[i] {
-				i++
-			}
-			enc = enc[i:]
-			encBytes = encBytes[i:]
-			i = 0
-			for i < len(enc) && i < len(encBytes) && enc[len(enc)-i-1] == encBytes[len(encBytes)-i-1] {
-				i++
-			}
-			enc = enc[:len(enc)-i]
-			encBytes = encBytes[:len(encBytes)-i]
-
-			if len(enc) > 20 {
-				enc = enc[:20] + "..."
-			}
-			if len(encBytes) > 20 {
-				encBytes = encBytes[:20] + "..."
-			}
-
-			t.Errorf("with escapeHTML=%t, encodings differ at %#q vs %#q",
-				escapeHTML, enc, encBytes)
-		}
-	}
-}
-
 func TestIssue10281(t *testing.T) {
 	type Foo struct {
 		N Number
-- 
2.51.0