From b8e533a7cdc60d84a0c52bfaf3dcb5bf148ac3a8 Mon Sep 17 00:00:00 2001 From: Charlie Vieth Date: Fri, 2 Dec 2022 22:53:26 -0500 Subject: [PATCH] unicode: improve SimpleFold performance by 2x for non-foldable code points MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Change SimpleFold to search the CaseRanges table only once when no folding is specified for the rune (previously up to two searches could be performed). This improves performance by 2x for runes that have no folds or are already upper case. As a side effect this improves the performance of To by roughly ~15% goos: darwin goarch: arm64 pkg: unicode cpu: Apple M1 Max │ base.10.txt │ new.10.txt │ │ sec/op │ sec/op vs base │ ToUpper-10 11.860n ± 1% 9.731n ± 1% -17.95% (p=0.000 n=10) ToLower-10 12.31n ± 1% 10.34n ± 1% -16.00% (p=0.000 n=10) SimpleFold/Upper-10 19.16n ± 0% 15.98n ± 1% -16.64% (p=0.000 n=10) SimpleFold/Lower-10 32.41n ± 1% 17.09n ± 1% -47.27% (p=0.000 n=10) SimpleFold/Fold-10 8.884n ± 4% 8.856n ± 8% ~ (p=0.700 n=10) SimpleFold/NoFold-10 30.87n ± 0% 15.49n ± 3% -49.84% (p=0.000 n=10) geomean 17.09n 12.47n -26.99% Change-Id: I6e5c7554106842955aadeef7b266c4c7944d3a97 Reviewed-on: https://go-review.googlesource.com/c/go/+/454958 Reviewed-by: Ian Lance Taylor Reviewed-by: Dmitri Shuralyov LUCI-TryBot-Result: Go LUCI Auto-Submit: Ian Lance Taylor --- src/unicode/letter.go | 67 ++++++++++++++++++++++++-------------- src/unicode/letter_test.go | 26 +++++++++++++++ 2 files changed, 68 insertions(+), 25 deletions(-) diff --git a/src/unicode/letter.go b/src/unicode/letter.go index 9e2cead631..3959314c97 100644 --- a/src/unicode/letter.go +++ b/src/unicode/letter.go @@ -206,34 +206,17 @@ func IsTitle(r rune) bool { return isExcludingLatin(Title, r) } -// to maps the rune using the specified case mapping. -// It additionally reports whether caseRange contained a mapping for r. -func to(_case int, r rune, caseRange []CaseRange) (mappedRune rune, foundMapping bool) { - if _case < 0 || MaxCase <= _case { - return ReplacementChar, false // as reasonable an error as any - } +// lookupCaseRange returns the CaseRange mapping for rune r or nil if no +// mapping exists for r. +func lookupCaseRange(r rune, caseRange []CaseRange) *CaseRange { // binary search over ranges lo := 0 hi := len(caseRange) for lo < hi { m := int(uint(lo+hi) >> 1) - cr := caseRange[m] + cr := &caseRange[m] if rune(cr.Lo) <= r && r <= rune(cr.Hi) { - delta := cr.Delta[_case] - if delta > MaxRune { - // In an Upper-Lower sequence, which always starts with - // an UpperCase letter, the real deltas always look like: - // {0, 1, 0} UpperCase (Lower is next) - // {-1, 0, -1} LowerCase (Upper, Title are previous) - // The characters at even offsets from the beginning of the - // sequence are upper case; the ones at odd offsets are lower. - // The correct mapping can be done by clearing or setting the low - // bit in the sequence offset. - // The constants UpperCase and TitleCase are even while LowerCase - // is odd so we take the low bit from _case. - return rune(cr.Lo) + ((r-rune(cr.Lo))&^1 | rune(_case&1)), true - } - return r + delta, true + return cr } if r < rune(cr.Lo) { hi = m @@ -241,6 +224,37 @@ func to(_case int, r rune, caseRange []CaseRange) (mappedRune rune, foundMapping lo = m + 1 } } + return nil +} + +// convertCase converts r to _case using CaseRange cr. +func convertCase(_case int, r rune, cr *CaseRange) rune { + delta := cr.Delta[_case] + if delta > MaxRune { + // In an Upper-Lower sequence, which always starts with + // an UpperCase letter, the real deltas always look like: + // {0, 1, 0} UpperCase (Lower is next) + // {-1, 0, -1} LowerCase (Upper, Title are previous) + // The characters at even offsets from the beginning of the + // sequence are upper case; the ones at odd offsets are lower. + // The correct mapping can be done by clearing or setting the low + // bit in the sequence offset. + // The constants UpperCase and TitleCase are even while LowerCase + // is odd so we take the low bit from _case. + return rune(cr.Lo) + ((r-rune(cr.Lo))&^1 | rune(_case&1)) + } + return r + delta +} + +// to maps the rune using the specified case mapping. +// It additionally reports whether caseRange contained a mapping for r. +func to(_case int, r rune, caseRange []CaseRange) (mappedRune rune, foundMapping bool) { + if _case < 0 || MaxCase <= _case { + return ReplacementChar, false // as reasonable an error as any + } + if cr := lookupCaseRange(r, caseRange); cr != nil { + return convertCase(_case, r, cr), true + } return r, false } @@ -364,8 +378,11 @@ func SimpleFold(r rune) rune { // No folding specified. This is a one- or two-element // equivalence class containing rune and ToLower(rune) // and ToUpper(rune) if they are different from rune. - if l := ToLower(r); l != r { - return l + if cr := lookupCaseRange(r, CaseRanges); cr != nil { + if l := convertCase(LowerCase, r, cr); l != r { + return l + } + return convertCase(UpperCase, r, cr) } - return ToUpper(r) + return r } diff --git a/src/unicode/letter_test.go b/src/unicode/letter_test.go index 123f9a642e..75c8aeee90 100644 --- a/src/unicode/letter_test.go +++ b/src/unicode/letter_test.go @@ -642,3 +642,29 @@ func TestNegativeRune(t *testing.T) { } } } + +func BenchmarkToUpper(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = ToUpper('δ') + } +} + +func BenchmarkToLower(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = ToLower('Δ') + } +} + +func BenchmarkSimpleFold(b *testing.B) { + bench := func(name string, r rune) { + b.Run(name, func(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = SimpleFold(r) + } + }) + } + bench("Upper", 'Δ') + bench("Lower", 'δ') + bench("Fold", '\u212A') + bench("NoFold", '習') +} -- 2.48.1