]> Cypherpunks repositories - gostls13.git/commitdiff
encoding/json: simplify folded name logic
authorJoe Tsai <joetsai@digital-static.net>
Mon, 20 Feb 2023 19:26:10 +0000 (11:26 -0800)
committerGopher Robot <gobot@golang.org>
Mon, 27 Feb 2023 17:37:27 +0000 (17:37 +0000)
The folded name logic (despite all attempts to optimize it)
was fundamentally an O(n) operation where every field in a struct
needed to be linearly scanned in order to find a match.
This made unmashaling of unknown fields always O(n).
Instead of optimizing the comparison for each field,
make it such that we can look up a name in O(1).

We accomplish this by maintaining a map keyed by pre-folded names,
which we can pre-calculate when processing the struct type.
Using a stack-allocated buffer, we can fold the input name and
look up its presence in the map.

Also, instead of mapping from names to indexes,
map directly to a pointer to the field information.
The memory cost of this is the same and avoids an extra slice index.

The new logic is both simpler and faster.

Performance:

name                   old time/op    new time/op    delta
CodeDecoder           2.47ms ± 4%    2.42ms ± 2%  -1.83%  (p=0.022 n=10+9)
UnicodeDecoder         259ns ± 2%     248ns ± 1%  -4.32%  (p=0.000 n=10+10)
DecoderStream          150ns ± 1%     149ns ± 1%    ~     (p=0.516 n=10+10)
CodeUnmarshal         3.13ms ± 2%    3.09ms ± 2%  -1.37%  (p=0.022 n=10+9)
CodeUnmarshalReuse    2.50ms ± 1%    2.45ms ± 1%  -1.96%  (p=0.001 n=8+9)
UnmarshalString       67.1ns ± 5%    64.5ns ± 5%  -3.90%  (p=0.005 n=10+10)
UnmarshalFloat64      60.1ns ± 4%    58.4ns ± 2%  -2.89%  (p=0.002 n=10+8)
UnmarshalInt64        51.0ns ± 4%    49.2ns ± 1%  -3.53%  (p=0.001 n=10+8)
Issue10335            80.7ns ± 2%    79.2ns ± 1%  -1.82%  (p=0.016 n=10+8)
Issue34127            28.6ns ± 3%    28.8ns ± 3%    ~     (p=0.388 n=9+10)
Unmapped               177ns ± 2%     177ns ± 2%    ~     (p=0.956 n=10+10)

Change-Id: I478b2b958f5a63a69c9a991a39cd5ffb43244a2a
Reviewed-on: https://go-review.googlesource.com/c/go/+/471196
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Run-TryBot: Joseph Tsai <joetsai@digital-static.net>
Auto-Submit: Joseph Tsai <joetsai@digital-static.net>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Johan Brandhorst-Satzkorn <johan.brandhorst@gmail.com>
Reviewed-by: Than McIntosh <thanm@google.com>
Reviewed-by: Daniel Martí <mvdan@mvdan.cc>
src/encoding/json/decode.go
src/encoding/json/encode.go
src/encoding/json/fold.go
src/encoding/json/fold_test.go

index 0a1cadf42edb3fce7cecb09ed7dbcd33b07afb48..53470d8c88e4ab1bdb9da48f76b63590da4e0177 100644 (file)
@@ -690,20 +690,9 @@ func (d *decodeState) object(v reflect.Value) error {
                        }
                        subv = mapElem
                } else {
-                       var f *field
-                       if i, ok := fields.nameIndex[string(key)]; ok {
-                               // Found an exact name match.
-                               f = &fields.list[i]
-                       } else {
-                               // Fall back to the expensive case-insensitive
-                               // linear search.
-                               for i := range fields.list {
-                                       ff := &fields.list[i]
-                                       if ff.equalFold(ff.nameBytes, key) {
-                                               f = ff
-                                               break
-                                       }
-                               }
+                       f := fields.byExactName[string(key)]
+                       if f == nil {
+                               f = fields.byFoldedName[string(foldName(key))]
                        }
                        if f != nil {
                                subv = v
index de639aa008aa278c5720ba417f2323a3b2125fa2..f3c824d13e0d2dc52918b67aeb6cf69807b8adbf 100644 (file)
@@ -672,8 +672,9 @@ type structEncoder struct {
 }
 
 type structFields struct {
-       list      []field
-       nameIndex map[string]int
+       list         []field
+       byExactName  map[string]*field
+       byFoldedName map[string]*field
 }
 
 func (se structEncoder) encode(e *encodeState, v reflect.Value, opts encOpts) {
@@ -1033,8 +1034,7 @@ func appendString[Bytes []byte | string](dst []byte, src Bytes, escapeHTML bool)
 // A field represents a single field found in a struct.
 type field struct {
        name      string
-       nameBytes []byte                 // []byte(name)
-       equalFold func(s, t []byte) bool // bytes.EqualFold or equivalent
+       nameBytes []byte // []byte(name)
 
        nameNonEsc  string // `"` + name + `":`
        nameEscHTML string // `"` + HTMLEscape(name) + `":`
@@ -1161,7 +1161,6 @@ func typeFields(t reflect.Type) structFields {
                                                quoted:    quoted,
                                        }
                                        field.nameBytes = []byte(field.name)
-                                       field.equalFold = foldFunc(field.nameBytes)
 
                                        // Build nameEscHTML and nameNonEsc ahead of time.
                                        nameEscBuf = appendHTMLEscape(nameEscBuf[:0], field.nameBytes)
@@ -1240,11 +1239,16 @@ func typeFields(t reflect.Type) structFields {
                f := &fields[i]
                f.encoder = typeEncoder(typeByIndex(t, f.index))
        }
-       nameIndex := make(map[string]int, len(fields))
+       exactNameIndex := make(map[string]*field, len(fields))
+       foldedNameIndex := make(map[string]*field, len(fields))
        for i, field := range fields {
-               nameIndex[field.name] = i
+               exactNameIndex[field.name] = &fields[i]
+               // For historical reasons, first folded match takes precedence.
+               if _, ok := foldedNameIndex[string(foldName(field.nameBytes))]; !ok {
+                       foldedNameIndex[string(foldName(field.nameBytes))] = &fields[i]
+               }
        }
-       return structFields{fields, nameIndex}
+       return structFields{fields, exactNameIndex, foldedNameIndex}
 }
 
 // dominantField looks through the fields, all of which are known to
index 0f9b09d712da4184c2cbfbd14191293aa9ada693..c4c671b52765a41c66fe6b055ded986c22b469c7 100644 (file)
 package json
 
 import (
-       "bytes"
+       "unicode"
        "unicode/utf8"
 )
 
-const (
-       caseMask     = ^byte(0x20) // Mask to ignore case in ASCII.
-       kelvin       = '\u212a'
-       smallLongEss = '\u017f'
-)
-
-// foldFunc returns one of four different case folding equivalence
-// functions, from most general (and slow) to fastest:
-//
-// 1) bytes.EqualFold, if the key s contains any non-ASCII UTF-8
-// 2) equalFoldRight, if s contains special folding ASCII ('k', 'K', 's', 'S')
-// 3) asciiEqualFold, no special, but includes non-letters (including _)
-// 4) simpleLetterEqualFold, no specials, no non-letters.
-//
-// The letters S and K are special because they map to 3 runes, not just 2:
-//   - S maps to s and to U+017F 'ſ' Latin small letter long s
-//   - k maps to K and to U+212A 'K' Kelvin sign
-//
-// See https://play.golang.org/p/tTxjOc0OGo
-//
-// The returned function is specialized for matching against s and
-// should only be given s. It's not curried for performance reasons.
-func foldFunc(s []byte) func(s, t []byte) bool {
-       nonLetter := false
-       special := false // special letter
-       for _, b := range s {
-               if b >= utf8.RuneSelf {
-                       return bytes.EqualFold
-               }
-               upper := b & caseMask
-               if upper < 'A' || upper > 'Z' {
-                       nonLetter = true
-               } else if upper == 'K' || upper == 'S' {
-                       // See above for why these letters are special.
-                       special = true
-               }
-       }
-       if special {
-               return equalFoldRight
-       }
-       if nonLetter {
-               return asciiEqualFold
-       }
-       return simpleLetterEqualFold
+// foldName returns a folded string such that foldName(x) == foldName(y)
+// is identical to bytes.EqualFold(x, y).
+func foldName(in []byte) []byte {
+       // This is inlinable to take advantage of "function outlining".
+       var arr [32]byte // large enough for most JSON names
+       return appendFoldedName(arr[:0], in)
 }
 
-// equalFoldRight is a specialization of bytes.EqualFold when s is
-// known to be all ASCII (including punctuation), but contains an 's',
-// 'S', 'k', or 'K', requiring a Unicode fold on the bytes in t.
-// See comments on foldFunc.
-func equalFoldRight(s, t []byte) bool {
-       for _, sb := range s {
-               if len(t) == 0 {
-                       return false
-               }
-               tb := t[0]
-               if tb < utf8.RuneSelf {
-                       if sb != tb {
-                               sbUpper := sb & caseMask
-                               if 'A' <= sbUpper && sbUpper <= 'Z' {
-                                       if sbUpper != tb&caseMask {
-                                               return false
-                                       }
-                               } else {
-                                       return false
-                               }
+func appendFoldedName(out, in []byte) []byte {
+       for i := 0; i < len(in); {
+               // Handle single-byte ASCII.
+               if c := in[i]; c < utf8.RuneSelf {
+                       if 'a' <= c && c <= 'z' {
+                               c -= 'a' - 'A'
                        }
-                       t = t[1:]
+                       out = append(out, c)
+                       i++
                        continue
                }
-               // sb is ASCII and t is not. t must be either kelvin
-               // sign or long s; sb must be s, S, k, or K.
-               tr, size := utf8.DecodeRune(t)
-               switch sb {
-               case 's', 'S':
-                       if tr != smallLongEss {
-                               return false
-                       }
-               case 'k', 'K':
-                       if tr != kelvin {
-                               return false
-                       }
-               default:
-                       return false
-               }
-               t = t[size:]
-
-       }
-       return len(t) == 0
-}
-
-// asciiEqualFold is a specialization of bytes.EqualFold for use when
-// s is all ASCII (but may contain non-letters) and contains no
-// special-folding letters.
-// See comments on foldFunc.
-func asciiEqualFold(s, t []byte) bool {
-       if len(s) != len(t) {
-               return false
-       }
-       for i, sb := range s {
-               tb := t[i]
-               if sb == tb {
-                       continue
-               }
-               if ('a' <= sb && sb <= 'z') || ('A' <= sb && sb <= 'Z') {
-                       if sb&caseMask != tb&caseMask {
-                               return false
-                       }
-               } else {
-                       return false
-               }
+               // Handle multi-byte Unicode.
+               r, n := utf8.DecodeRune(in[i:])
+               out = utf8.AppendRune(out, foldRune(r))
+               i += n
        }
-       return true
+       return out
 }
 
-// simpleLetterEqualFold is a specialization of bytes.EqualFold for
-// use when s is all ASCII letters (no underscores, etc) and also
-// doesn't contain 'k', 'K', 's', or 'S'.
-// See comments on foldFunc.
-func simpleLetterEqualFold(s, t []byte) bool {
-       if len(s) != len(t) {
-               return false
-       }
-       for i, b := range s {
-               if b&caseMask != t[i]&caseMask {
-                       return false
+// foldRune is returns the smallest rune for all runes in the same fold set.
+func foldRune(r rune) rune {
+       for {
+               r2 := unicode.SimpleFold(r)
+               if r2 <= r {
+                       return r2
                }
+               r = r2
        }
-       return true
 }
index 4daa3590f559f7d77c631e35f6575fcf5fab7325..9d6fd0559d69eff938a5d1432b4bc3a56c5a2be9 100644 (file)
@@ -6,105 +6,45 @@ package json
 
 import (
        "bytes"
-       "strings"
        "testing"
-       "unicode/utf8"
 )
 
-var foldTests = []struct {
-       fn   func(s, t []byte) bool
-       s, t string
-       want bool
-}{
-       {equalFoldRight, "", "", true},
-       {equalFoldRight, "a", "a", true},
-       {equalFoldRight, "", "a", false},
-       {equalFoldRight, "a", "", false},
-       {equalFoldRight, "a", "A", true},
-       {equalFoldRight, "AB", "ab", true},
-       {equalFoldRight, "AB", "ac", false},
-       {equalFoldRight, "sbkKc", "ſbKKc", true},
-       {equalFoldRight, "SbKkc", "ſbKKc", true},
-       {equalFoldRight, "SbKkc", "ſbKK", false},
-       {equalFoldRight, "e", "é", false},
-       {equalFoldRight, "s", "S", true},
-
-       {simpleLetterEqualFold, "", "", true},
-       {simpleLetterEqualFold, "abc", "abc", true},
-       {simpleLetterEqualFold, "abc", "ABC", true},
-       {simpleLetterEqualFold, "abc", "ABCD", false},
-       {simpleLetterEqualFold, "abc", "xxx", false},
-
-       {asciiEqualFold, "a_B", "A_b", true},
-       {asciiEqualFold, "aa@", "aa`", false}, // verify 0x40 and 0x60 aren't case-equivalent
-}
-
-func TestFold(t *testing.T) {
-       for i, tt := range foldTests {
-               if got := tt.fn([]byte(tt.s), []byte(tt.t)); got != tt.want {
-                       t.Errorf("%d. %q, %q = %v; want %v", i, tt.s, tt.t, got, tt.want)
-               }
-               truth := strings.EqualFold(tt.s, tt.t)
-               if truth != tt.want {
-                       t.Errorf("strings.EqualFold doesn't agree with case %d", i)
-               }
+func FuzzEqualFold(f *testing.F) {
+       for _, ss := range [][2]string{
+               {"", ""},
+               {"123abc", "123ABC"},
+               {"αβδ", "ΑΒΔ"},
+               {"abc", "xyz"},
+               {"abc", "XYZ"},
+               {"1", "2"},
+               {"hello, world!", "hello, world!"},
+               {"hello, world!", "Hello, World!"},
+               {"hello, world!", "HELLO, WORLD!"},
+               {"hello, world!", "jello, world!"},
+               {"γειά, κόσμε!", "γειά, κόσμε!"},
+               {"γειά, κόσμε!", "Γειά, Κόσμε!"},
+               {"γειά, κόσμε!", "ΓΕΙΆ, ΚΌΣΜΕ!"},
+               {"γειά, κόσμε!", "ΛΕΙΆ, ΚΌΣΜΕ!"},
+               {"AESKey", "aesKey"},
+               {"AESKEY", "aes_key"},
+               {"aes_key", "AES_KEY"},
+               {"AES_KEY", "aes-key"},
+               {"aes-key", "AES-KEY"},
+               {"AES-KEY", "aesKey"},
+               {"aesKey", "AesKey"},
+               {"AesKey", "AESKey"},
+               {"AESKey", "aeskey"},
+               {"DESKey", "aeskey"},
+               {"AES Key", "aeskey"},
+       } {
+               f.Add([]byte(ss[0]), []byte(ss[1]))
        }
-}
-
-func TestFoldAgainstUnicode(t *testing.T) {
-       var buf1, buf2 []byte
-       var runes []rune
-       for i := 0x20; i <= 0x7f; i++ {
-               runes = append(runes, rune(i))
-       }
-       runes = append(runes, kelvin, smallLongEss)
-
-       funcs := []struct {
-               name   string
-               fold   func(s, t []byte) bool
-               letter bool // must be ASCII letter
-               simple bool // must be simple ASCII letter (not 'S' or 'K')
-       }{
-               {
-                       name: "equalFoldRight",
-                       fold: equalFoldRight,
-               },
-               {
-                       name:   "asciiEqualFold",
-                       fold:   asciiEqualFold,
-                       simple: true,
-               },
-               {
-                       name:   "simpleLetterEqualFold",
-                       fold:   simpleLetterEqualFold,
-                       simple: true,
-                       letter: true,
-               },
-       }
-
-       for _, ff := range funcs {
-               for _, r := range runes {
-                       if r >= utf8.RuneSelf {
-                               continue
-                       }
-                       if ff.letter && !isASCIILetter(byte(r)) {
-                               continue
-                       }
-                       if ff.simple && (r == 's' || r == 'S' || r == 'k' || r == 'K') {
-                               continue
-                       }
-                       for _, r2 := range runes {
-                               buf1 = append(utf8.AppendRune(append(buf1[:0], 'x'), r), 'x')
-                               buf2 = append(utf8.AppendRune(append(buf2[:0], 'x'), r2), 'x')
-                               want := bytes.EqualFold(buf1, buf2)
-                               if got := ff.fold(buf1, buf2); got != want {
-                                       t.Errorf("%s(%q, %q) = %v; want %v", ff.name, buf1, buf2, got, want)
-                               }
-                       }
+       equalFold := func(x, y []byte) bool { return string(foldName(x)) == string(foldName(y)) }
+       f.Fuzz(func(t *testing.T, x, y []byte) {
+               got := equalFold(x, y)
+               want := bytes.EqualFold(x, y)
+               if got != want {
+                       t.Errorf("equalFold(%q, %q) = %v, want %v", x, y, got, want)
                }
-       }
-}
-
-func isASCIILetter(b byte) bool {
-       return ('A' <= b && b <= 'Z') || ('a' <= b && b <= 'z')
+       })
 }