From: Marcel van Lohuizen Date: Thu, 15 Nov 2012 21:23:56 +0000 (+0100) Subject: exp/locale/collate: changed implementation of Compare and CompareString to X-Git-Tag: go1.1rc2~1860 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=8b7ea6489c40f2e54d01836b2fae611c39fb09d4;p=gostls13.git exp/locale/collate: changed implementation of Compare and CompareString to compare incrementally. Also modified collation API to be more high-level by removing the need for an explicit buffer to be passed as an argument. This considerably speeds up Compare and CompareString. This change also eliminates the need to reinitialize the normalization buffer for each use of an iter. This also significantly improves performance for Key and KeyString. R=r, rsc CC=golang-dev https://golang.org/cl/6842050 --- diff --git a/src/pkg/exp/locale/collate/collate.go b/src/pkg/exp/locale/collate/collate.go index 2b2c8eba1f..a08dcae0d5 100644 --- a/src/pkg/exp/locale/collate/collate.go +++ b/src/pkg/exp/locale/collate/collate.go @@ -83,6 +83,13 @@ type Collator struct { f norm.Form t *table + + _iter [2]iter +} + +func (c *Collator) iter(i int) *iter { + // TODO: evaluate performance for making the second iterator optional. + return &c._iter[i] } // Locales returns the list of locales for which collating differs from its parent locale. @@ -100,11 +107,18 @@ func New(loc string) *Collator { t = mainTable.indexedTable(idx) } } - return &Collator{ + return newCollator(t) +} + +func newCollator(t *table) *Collator { + c := &Collator{ Strength: Quaternary, f: norm.NFD, t: t, } + c._iter[0].init(c) + c._iter[1].init(c) + return c } // SetVariableTop sets all runes with primary strength less than the primary @@ -113,63 +127,114 @@ func (c *Collator) SetVariableTop(r rune) { // TODO: implement } -// Buffer holds reusable buffers that can be used during collation. -// Reusing a Buffer for the various calls that accept it may avoid -// unnecessary memory allocations. +// Buffer holds keys generated by Key and KeyString. type Buffer struct { - // TODO: try various parameters and techniques, such as using - // a chan of buffers for a pool. - ba [4096]byte - wa [512]colElem + buf [4096]byte key []byte - ce []colElem } func (b *Buffer) init() { - if b.ce == nil { - b.ce = b.wa[:0] - b.key = b.ba[:0] - } else { - b.ce = b.ce[:0] + if b.key == nil { + b.key = b.buf[:0] } } -// ResetKeys clears the buffer used for generated keys. Calling ResetKeys -// invalidates keys previously obtained from Key or KeyFromString. -func (b *Buffer) ResetKeys() { - b.ce = b.ce[:0] +// Reset clears the buffer from previous results generated by Key and KeyString. +func (b *Buffer) Reset() { b.key = b.key[:0] } // Compare returns an integer comparing the two byte slices. // The result will be 0 if a==b, -1 if a < b, and +1 if a > b. -// Compare calls ResetKeys, thereby invalidating keys -// previously generated using Key or KeyFromString using buf. -func (c *Collator) Compare(buf *Buffer, a, b []byte) int { - // TODO: for now we simply compute keys and compare. Once we - // have good benchmarks, move to an implementation that works - // incrementally for the majority of cases. - // - Benchmark with long strings that only vary in modifiers. - buf.ResetKeys() - ka := c.Key(buf, a) - kb := c.Key(buf, b) - defer buf.ResetKeys() - return bytes.Compare(ka, kb) +func (c *Collator) Compare(a, b []byte) int { + // TODO: skip identical prefixes once we have a fast way to detect if a rune is + // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. + c.iter(0).setInput(c, a) + c.iter(1).setInput(c, b) + if res := c.compare(); res != 0 { + return res + } + if Identity == c.Strength { + return bytes.Compare(a, b) + } + return 0 } // CompareString returns an integer comparing the two strings. // The result will be 0 if a==b, -1 if a < b, and +1 if a > b. -// CompareString calls ResetKeys, thereby invalidating keys -// previously generated using Key or KeyFromString using buf. -func (c *Collator) CompareString(buf *Buffer, a, b string) int { - buf.ResetKeys() - ka := c.KeyFromString(buf, a) - kb := c.KeyFromString(buf, b) - defer buf.ResetKeys() - return bytes.Compare(ka, kb) +func (c *Collator) CompareString(a, b string) int { + // TODO: skip identical prefixes once we have a fast way to detect if a rune is + // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. + c.iter(0).setInputString(c, a) + c.iter(1).setInputString(c, b) + if res := c.compare(); res != 0 { + return res + } + if Identity == c.Strength { + if a < b { + return -1 + } else if a > b { + return 1 + } + } + return 0 +} + +func compareLevel(f func(i *iter) int, a, b *iter) int { + a.pce = 0 + b.pce = 0 + for { + va := f(a) + vb := f(b) + if va != vb { + if va < vb { + return -1 + } + return 1 + } else if va == 0 { + break + } + } + return 0 } -func (c *Collator) Prefix(buf *Buffer, s, prefix []byte) int { +func (c *Collator) compare() int { + ia, ib := c.iter(0), c.iter(1) + // Process primary level + if c.Alternate != AltShifted { + // TODO: implement script reordering + // TODO: special hiragana handling + if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 { + return res + } + } else { + // TODO: handle shifted + } + if Secondary <= c.Strength { + f := (*iter).nextSecondary + if c.Backwards { + f = (*iter).prevSecondary + } + if res := compareLevel(f, ia, ib); res != 0 { + return res + } + } + // TODO: special case handling (Danish?) + if Tertiary <= c.Strength || c.CaseLevel { + if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 { + return res + } + // TODO: Not needed for the default value of AltNonIgnorable? + if Quaternary <= c.Strength { + if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 { + return res + } + } + } + return 0 +} + +func (c *Collator) Prefix(s, prefix []byte) int { // iterate over s, track bytes consumed. return 0 } @@ -177,12 +242,11 @@ func (c *Collator) Prefix(buf *Buffer, s, prefix []byte) int { // Key returns the collation key for str. // Passing the buffer buf may avoid memory allocations. // The returned slice will point to an allocation in Buffer and will remain -// valid until the next call to buf.ResetKeys(). +// valid until the next call to buf.Reset(). func (c *Collator) Key(buf *Buffer, str []byte) []byte { // See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details. buf.init() - c.getColElems(buf, str) - return c.key(buf, buf.ce) + return c.key(buf, c.getColElems(str)) } // KeyFromString returns the collation key for str. @@ -192,8 +256,7 @@ func (c *Collator) Key(buf *Buffer, str []byte) []byte { func (c *Collator) KeyFromString(buf *Buffer, str string) []byte { // See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details. buf.init() - c.getColElemsString(buf, str) - return c.key(buf, buf.ce) + return c.key(buf, c.getColElemsString(str)) } func (c *Collator) key(buf *Buffer, w []colElem) []byte { @@ -203,35 +266,63 @@ func (c *Collator) key(buf *Buffer, w []colElem) []byte { return buf.key[kn:] } -func (c *Collator) getColElems(buf *Buffer, str []byte) { - i := c.iter() - i.src.SetInput(c.f, str) +func (c *Collator) getColElems(str []byte) []colElem { + i := c.iter(0) + i.setInput(c, str) for !i.done() { - buf.ce = i.next(buf.ce) + i.next() } + return i.ce } -func (c *Collator) getColElemsString(buf *Buffer, str string) { - i := c.iter() - i.src.SetInputString(c.f, str) +func (c *Collator) getColElemsString(str string) []colElem { + i := c.iter(0) + i.setInputString(c, str) for !i.done() { - buf.ce = i.next(buf.ce) + i.next() } + return i.ce } type iter struct { src norm.Iter - ba [1024]byte + norm [1024]byte buf []byte - t *table p int minBufSize int + + wa [512]colElem + ce []colElem + pce int + + t *table _done, eof bool } -func (c *Collator) iter() iter { - i := iter{t: c.t, minBufSize: c.t.maxContractLen} - i.buf = i.ba[:0] +func (i *iter) init(c *Collator) { + i.t = c.t + i.minBufSize = c.t.maxContractLen + i.ce = i.wa[:0] + i.buf = i.norm[:0] +} + +func (i *iter) reset() { + i.ce = i.ce[:0] + i.buf = i.buf[:0] + i.p = 0 + i.eof = i.src.Done() + i._done = i.eof +} + +func (i *iter) setInput(c *Collator, s []byte) *iter { + i.src.SetInput(c.f, s) + i.reset() + return i +} + +func (i *iter) setInputString(c *Collator, s string) *iter { + i.src.SetInputString(c.f, s) + i.reset() return i } @@ -239,7 +330,7 @@ func (i *iter) done() bool { return i._done } -func (i *iter) next(ce []colElem) []colElem { +func (i *iter) next() { if !i.eof && len(i.buf)-i.p < i.minBufSize { // replenish buffer n := copy(i.buf, i.buf[i.p:]) @@ -250,11 +341,67 @@ func (i *iter) next(ce []colElem) []colElem { } if i.p == len(i.buf) { i._done = true - return ce + return } - ce, sz := i.t.appendNext(ce, i.buf[i.p:]) + sz := 0 + i.ce, sz = i.t.appendNext(i.ce, i.buf[i.p:]) i.p += sz - return ce +} + +func (i *iter) nextPrimary() int { + for { + for ; i.pce < len(i.ce); i.pce++ { + if v := i.ce[i.pce].primary(); v != 0 { + i.pce++ + return v + } + } + if i.done() { + return 0 + } + i.next() + } + panic("should not reach here") +} + +func (i *iter) nextSecondary() int { + for ; i.pce < len(i.ce); i.pce++ { + if v := i.ce[i.pce].secondary(); v != 0 { + i.pce++ + return v + } + } + return 0 +} + +func (i *iter) prevSecondary() int { + for ; i.pce < len(i.ce); i.pce++ { + if v := i.ce[len(i.ce)-i.pce-1].secondary(); v != 0 { + i.pce++ + return v + } + } + return 0 +} + +func (i *iter) nextTertiary() int { + for ; i.pce < len(i.ce); i.pce++ { + if v := i.ce[i.pce].tertiary(); v != 0 { + i.pce++ + return int(v) + } + } + return 0 +} + +func (i *iter) nextQuaternary() int { + for ; i.pce < len(i.ce); i.pce++ { + if v := i.ce[i.pce].quaternary(); v != 0 { + i.pce++ + return v + } + } + return 0 } func appendPrimary(key []byte, p int) []byte { diff --git a/src/pkg/exp/locale/collate/collate_test.go b/src/pkg/exp/locale/collate/collate_test.go index fa7c30262a..6d6d6bce24 100644 --- a/src/pkg/exp/locale/collate/collate_test.go +++ b/src/pkg/exp/locale/collate/collate_test.go @@ -300,7 +300,7 @@ var keyFromElemTests = []keyFromElemTest{ func TestKeyFromElems(t *testing.T) { buf := collate.Buffer{} for i, tt := range keyFromElemTests { - buf.ResetKeys() + buf.Reset() ws := collate.ProcessWeights(tt.opt.alt, tt.opt.top, tt.in) res := collate.KeyFromElems(tt.opt.collator(), &buf, ws) if len(res) != len(tt.out) { @@ -325,7 +325,6 @@ func TestGetColElems(t *testing.T) { // error is reported in TestAppendNext continue } - buf := collate.Buffer{} // Create one large test per table str := make([]byte, 0, 4000) out := ColElems{} @@ -336,7 +335,7 @@ func TestGetColElems(t *testing.T) { } } for j, chk := range append(tt.chk, check{string(str), len(str), out}) { - ws := collate.GetColElems(c, &buf, []byte(chk.in)[:chk.n]) + ws := collate.GetColElems(c, []byte(chk.in)[:chk.n]) if len(ws) != len(chk.out) { t.Errorf("%d:%d: len(ws) was %d; want %d", i, j, len(ws), len(chk.out)) continue @@ -404,19 +403,27 @@ type compareTest struct { var compareTests = []compareTest{ {"a\u0301", "a", 1}, + {"a\u0301b", "ab", 1}, {"a", "a\u0301", -1}, + {"ab", "a\u0301b", -1}, + {"bc", "a\u0301c", 1}, + {"ab", "aB", -1}, {"a\u0301", "a\u0301", 0}, {"a", "a", 0}, + // Only clip prefixes of whole runes. + {"\u302E", "\u302F", 1}, + // Don't clip prefixes when last rune of prefix may be part of contraction. + {"a\u035E", "a\u0301\u035F", -1}, + {"a\u0301\u035Fb", "a\u0301\u035F", -1}, } func TestCompare(t *testing.T) { c, _ := makeTable(appendNextTests[4].in) - buf := collate.Buffer{} for i, tt := range compareTests { - if res := c.Compare(&buf, []byte(tt.a), []byte(tt.b)); res != tt.res { + if res := c.Compare([]byte(tt.a), []byte(tt.b)); res != tt.res { t.Errorf("%d: Compare(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res) } - if res := c.CompareString(&buf, tt.a, tt.b); res != tt.res { + if res := c.CompareString(tt.a, tt.b); res != tt.res { t.Errorf("%d: CompareString(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res) } } diff --git a/src/pkg/exp/locale/collate/export.go b/src/pkg/exp/locale/collate/export.go index 01750dd070..1915c93963 100644 --- a/src/pkg/exp/locale/collate/export.go +++ b/src/pkg/exp/locale/collate/export.go @@ -4,8 +4,6 @@ package collate -import "exp/norm" - // Init is used by type Builder in exp/locale/collate/build/ // to create Collator instances. It is for internal use only. func Init(data interface{}) *Collator { @@ -24,11 +22,7 @@ func Init(data interface{}) *Collator { t.contractElem = init.ContractElems() t.maxContractLen = init.MaxContractLen() t.variableTop = init.VariableTop() - return &Collator{ - Strength: Quaternary, - f: norm.NFD, - t: t, - } + return newCollator(t) } type tableInitializer interface { diff --git a/src/pkg/exp/locale/collate/export_test.go b/src/pkg/exp/locale/collate/export_test.go index 09caccac5f..3afe664579 100644 --- a/src/pkg/exp/locale/collate/export_test.go +++ b/src/pkg/exp/locale/collate/export_test.go @@ -72,10 +72,9 @@ func SetTop(c *Collator, top int) { c.t.variableTop = uint32(top) } -func GetColElems(c *Collator, buf *Buffer, str []byte) []Weights { - buf.ResetKeys() - c.getColElems(buf, str) - return convertToWeights(buf.ce) +func GetColElems(c *Collator, str []byte) []Weights { + ce := c.getColElems(str) + return convertToWeights(ce) } func ProcessWeights(h AlternateHandling, top int, w []Weights) []Weights { diff --git a/src/pkg/exp/locale/collate/maketables.go b/src/pkg/exp/locale/collate/maketables.go index 80e3c394f1..40bf10ab44 100644 --- a/src/pkg/exp/locale/collate/maketables.go +++ b/src/pkg/exp/locale/collate/maketables.go @@ -180,7 +180,7 @@ func skipAlt(a string) bool { func failOnError(e error) { if e != nil { - log.Fatal(e) + log.Panic(e) } } @@ -677,7 +677,7 @@ func testCollator(c *collate.Collator) { if bytes.Compare(k0, k) != 0 { failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k)) } - buf.ResetKeys() + buf.Reset() } fmt.Println("PASS") } diff --git a/src/pkg/exp/locale/collate/regtest.go b/src/pkg/exp/locale/collate/regtest.go index 14a447c1e4..e30915ed89 100644 --- a/src/pkg/exp/locale/collate/regtest.go +++ b/src/pkg/exp/locale/collate/regtest.go @@ -236,9 +236,9 @@ func doTest(t Test) { if strings.Contains(t.name, "NON_IGNOR") { c.Alternate = collate.AltNonIgnorable } - prev := t.str[0] for i := 1; i < len(t.str); i++ { + b.Reset() s := t.str[i] ka := c.Key(b, prev) kb := c.Key(b, s) @@ -247,10 +247,10 @@ func doTest(t Test) { prev = s continue } - if r := c.Compare(b, prev, s); r == 1 { + if r := c.Compare(prev, s); r == 1 { fail(t, "%d: Compare(%.4X, %.4X) == %d; want -1 or 0", i, runes(prev), runes(s), r) } - if r := c.Compare(b, s, prev); r == -1 { + if r := c.Compare(s, prev); r == -1 { fail(t, "%d: Compare(%.4X, %.4X) == %d; want 1 or 0", i, runes(s), runes(prev), r) } prev = s diff --git a/src/pkg/exp/locale/collate/table_test.go b/src/pkg/exp/locale/collate/table_test.go index 446d592b6d..95b74fdc51 100644 --- a/src/pkg/exp/locale/collate/table_test.go +++ b/src/pkg/exp/locale/collate/table_test.go @@ -141,7 +141,7 @@ var appendNextTests = []tableTest{ {"\u0316", [][]int{{0, 220}}}, {"\u0317", [][]int{{0, 220}, {0, 220}}}, {"\u302D", [][]int{{0, 222}}}, - {"\u302E", [][]int{{0, 224}}}, // used as starter + {"\u302E", [][]int{{0, 225}}}, // used as starter {"\u302F", [][]int{{0, 224}}}, // used as starter {"\u18A9", [][]int{{0, 228}}}, {"\u0300", [][]int{{0, 230}}}, @@ -169,7 +169,7 @@ var appendNextTests = []tableTest{ {"a\u035Db\u035D", [][]int{{117}}}, {"a\u0301\u035Db", [][]int{{120}}}, {"a\u0301\u035F", [][]int{{121}}}, - {"a\u0301\u035Fb", [][]int{{122}}}, + {"a\u0301\u035Fb", [][]int{{119}}}, {"\u03B1\u0345", [][]int{{901}, {902}}}, {"\u302E\u18A9", [][]int{{0, 131}, {0, 132}}}, {"\u302F\u18A9", [][]int{{0, 130}}}, @@ -192,7 +192,7 @@ var appendNextTests = []tableTest{ // multiple gaps {"a\u0301\u035Db", 6, ColElems{w(120)}}, {"a\u0301\u035F", 5, ColElems{w(121)}}, - {"a\u0301\u035Fb", 6, ColElems{w(122)}}, + {"a\u0301\u035Fb", 6, ColElems{w(119)}}, {"a\u0316\u0301\u035F", 7, ColElems{w(121), w(0, 220)}}, {"a\u0301\u0315\u035Fb", 7, ColElems{w(121), w(0, 232)}}, {"a\u0316\u0301\u0315\u035Db", 5, ColElems{w(102), w(0, 220)}}, diff --git a/src/pkg/exp/locale/collate/tools/colcmp/col.go b/src/pkg/exp/locale/collate/tools/colcmp/col.go index 26e015cb2f..3f8d7eed65 100644 --- a/src/pkg/exp/locale/collate/tools/colcmp/col.go +++ b/src/pkg/exp/locale/collate/tools/colcmp/col.go @@ -91,5 +91,5 @@ func (c *goCollator) Key(b Input) []byte { } func (c *goCollator) Compare(a, b Input) int { - return c.c.Compare(&c.buf, a.UTF8, b.UTF8) + return c.c.Compare(a.UTF8, b.UTF8) }