f norm.Form
t *table
+
+ _iter [2]iter
+}
+
+// iter returns a pointer to element i of the Collator's two-element _iter
+// array, so callers work on the stored iterator rather than on a copy.
+func (c *Collator) iter(i int) *iter {
+	// TODO: evaluate performance for making the second iterator optional.
+	it := &c._iter[i]
+	return it
}
// Locales returns the list of locales for which collating differs from its parent locale.
t = mainTable.indexedTable(idx)
}
}
- return &Collator{
+ return newCollator(t)
+}
+
+func newCollator(t *table) *Collator {
+ c := &Collator{
Strength: Quaternary,
f: norm.NFD,
t: t,
}
+ c._iter[0].init(c)
+ c._iter[1].init(c)
+ return c
}
// SetVariableTop sets all runes with primary strength less than the primary
// TODO: implement
}
-// Buffer holds reusable buffers that can be used during collation.
-// Reusing a Buffer for the various calls that accept it may avoid
-// unnecessary memory allocations.
+// Buffer holds keys generated by Key and KeyFromString.
type Buffer struct {
- // TODO: try various parameters and techniques, such as using
- // a chan of buffers for a pool.
- ba [4096]byte
- wa [512]colElem
+ buf [4096]byte
key []byte
- ce []colElem
}
func (b *Buffer) init() {
- if b.ce == nil {
- b.ce = b.wa[:0]
- b.key = b.ba[:0]
- } else {
- b.ce = b.ce[:0]
+ if b.key == nil {
+ b.key = b.buf[:0]
}
}
-// ResetKeys clears the buffer used for generated keys. Calling ResetKeys
-// invalidates keys previously obtained from Key or KeyFromString.
-func (b *Buffer) ResetKeys() {
- b.ce = b.ce[:0]
+// Reset clears the buffer from previous results generated by Key and KeyFromString.
+func (b *Buffer) Reset() {
b.key = b.key[:0]
}
// Compare returns an integer comparing the two byte slices.
// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
-// Compare calls ResetKeys, thereby invalidating keys
-// previously generated using Key or KeyFromString using buf.
-func (c *Collator) Compare(buf *Buffer, a, b []byte) int {
- // TODO: for now we simply compute keys and compare. Once we
- // have good benchmarks, move to an implementation that works
- // incrementally for the majority of cases.
- // - Benchmark with long strings that only vary in modifiers.
- buf.ResetKeys()
- ka := c.Key(buf, a)
- kb := c.Key(buf, b)
- defer buf.ResetKeys()
- return bytes.Compare(ka, kb)
+func (c *Collator) Compare(a, b []byte) int {
+	// TODO: skip identical prefixes once we have a fast way to detect if a rune is
+	// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
+
+	// Load the two inputs into the iterators stored in c.
+	// NOTE(review): both iterators live inside the Collator, so concurrent
+	// calls on the same Collator presumably share this state - confirm.
+	c.iter(0).setInput(c, a)
+	c.iter(1).setInput(c, b)
+
+	res := c.compare()
+	if res == 0 && c.Strength == Identity {
+		// No collation level found a difference; at Identity strength the
+		// raw bytes decide the order.
+		res = bytes.Compare(a, b)
+	}
+	return res
}
// CompareString returns an integer comparing the two strings.
// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
-// CompareString calls ResetKeys, thereby invalidating keys
-// previously generated using Key or KeyFromString using buf.
-func (c *Collator) CompareString(buf *Buffer, a, b string) int {
- buf.ResetKeys()
- ka := c.KeyFromString(buf, a)
- kb := c.KeyFromString(buf, b)
- defer buf.ResetKeys()
- return bytes.Compare(ka, kb)
+func (c *Collator) CompareString(a, b string) int {
+	// TODO: skip identical prefixes once we have a fast way to detect if a rune is
+	// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
+
+	// Load the two inputs into the iterators stored in c.
+	c.iter(0).setInputString(c, a)
+	c.iter(1).setInputString(c, b)
+
+	res := c.compare()
+	if res != 0 || c.Strength != Identity {
+		return res
+	}
+	// No collation level found a difference; at Identity strength the raw
+	// string contents decide the order.
+	switch {
+	case a < b:
+		return -1
+	case a > b:
+		return 1
+	}
+	return 0
+}
+
+// compareLevel compares a and b on a single level. It rewinds both
+// iterators' pce cursors to 0, then repeatedly pulls one value from each
+// side with f until the two values differ or both are 0. It returns -1 or +1
+// according to the first differing pair and 0 if both sides ran out (f
+// returned 0) without a difference.
+func compareLevel(f func(i *iter) int, a, b *iter) int {
+	a.pce = 0
+	b.pce = 0
+	wa, wb := f(a), f(b)
+	for wa == wb && wa != 0 {
+		wa, wb = f(a), f(b)
+	}
+	switch {
+	case wa < wb:
+		return -1
+	case wa > wb:
+		return 1
+	}
+	return 0
}
-func (c *Collator) Prefix(buf *Buffer, s, prefix []byte) int {
+// compare runs the level-by-level comparison of the inputs currently loaded
+// in the Collator's two iterators and returns the result of the first level
+// that differs, or 0 if none does. Which levels run depends on c.Alternate,
+// c.Strength, c.Backwards and c.CaseLevel.
+func (c *Collator) compare() int {
+	left, right := c.iter(0), c.iter(1)
+
+	// Primary level.
+	if c.Alternate != AltShifted {
+		// TODO: implement script reordering
+		// TODO: special hiragana handling
+		if r := compareLevel((*iter).nextPrimary, left, right); r != 0 {
+			return r
+		}
+	}
+	// TODO: handle shifted. With AltShifted the primary level is skipped.
+	// NOTE(review): nextPrimary is the only level function here that calls
+	// next(), so skipping it presumably leaves the later levels with no
+	// collation elements to compare - confirm.
+
+	// Secondary level, read back to front when c.Backwards is set.
+	if c.Strength >= Secondary {
+		next := (*iter).nextSecondary
+		if c.Backwards {
+			next = (*iter).prevSecondary
+		}
+		if r := compareLevel(next, left, right); r != 0 {
+			return r
+		}
+	}
+
+	// TODO: special case handling (Danish?)
+	if c.Strength < Tertiary && !c.CaseLevel {
+		return 0
+	}
+	if r := compareLevel((*iter).nextTertiary, left, right); r != 0 {
+		return r
+	}
+
+	// The quaternary level is only reached through the tertiary branch.
+	// TODO: Not needed for the default value of AltNonIgnorable?
+	if c.Strength < Quaternary {
+		return 0
+	}
+	return compareLevel((*iter).nextQuaternary, left, right)
+}
+
+func (c *Collator) Prefix(s, prefix []byte) int {
// iterate over s, track bytes consumed.
return 0
}
// Key returns the collation key for str.
// Passing the buffer buf may avoid memory allocations.
// The returned slice will point to an allocation in Buffer and will remain
-// valid until the next call to buf.ResetKeys().
+// valid until the next call to buf.Reset().
func (c *Collator) Key(buf *Buffer, str []byte) []byte {
// See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
buf.init()
- c.getColElems(buf, str)
- return c.key(buf, buf.ce)
+ return c.key(buf, c.getColElems(str))
}
// KeyFromString returns the collation key for str.
func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
// See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
buf.init()
- c.getColElemsString(buf, str)
- return c.key(buf, buf.ce)
+ return c.key(buf, c.getColElemsString(str))
}
func (c *Collator) key(buf *Buffer, w []colElem) []byte {
return buf.key[kn:]
}
-func (c *Collator) getColElems(buf *Buffer, str []byte) {
- i := c.iter()
- i.src.SetInput(c.f, str)
+func (c *Collator) getColElems(str []byte) []colElem {
+ i := c.iter(0)
+ i.setInput(c, str)
for !i.done() {
- buf.ce = i.next(buf.ce)
+ i.next()
}
+ return i.ce
}
-func (c *Collator) getColElemsString(buf *Buffer, str string) {
- i := c.iter()
- i.src.SetInputString(c.f, str)
+func (c *Collator) getColElemsString(str string) []colElem {
+ i := c.iter(0)
+ i.setInputString(c, str)
for !i.done() {
- buf.ce = i.next(buf.ce)
+ i.next()
}
+ return i.ce
}
type iter struct {
src norm.Iter
- ba [1024]byte
+ norm [1024]byte
buf []byte
- t *table
p int
minBufSize int
+
+ wa [512]colElem
+ ce []colElem
+ pce int
+
+ t *table
_done, eof bool
}
-func (c *Collator) iter() iter {
- i := iter{t: c.t, minBufSize: c.t.maxContractLen}
- i.buf = i.ba[:0]
+// init binds the iterator to c's table, takes the table's maxContractLen as
+// the minimum buffer size, and points the buf and ce slices at the
+// iterator's own fixed-size arrays (norm and wa) with length 0.
+func (i *iter) init(c *Collator) {
+	tbl := c.t
+	i.t = tbl
+	i.minBufSize = tbl.maxContractLen
+	i.buf = i.norm[:0]
+	i.ce = i.wa[:0]
+}
+
+// reset prepares the iterator for a new input: it truncates ce and buf to
+// length 0 (keeping their storage), rewinds the read position p, and sets
+// both eof and _done to whatever the source iterator reports via Done.
+// It does not touch pce.
+func (i *iter) reset() {
+	i.ce, i.buf, i.p = i.ce[:0], i.buf[:0], 0
+	exhausted := i.src.Done()
+	i.eof, i._done = exhausted, exhausted
+}
+
+// setInput points the source iterator at the byte slice s using c's
+// normalization form, resets the iterator state, and returns i so that
+// calls can be chained.
+func (i *iter) setInput(c *Collator, s []byte) *iter {
+	form := c.f
+	i.src.SetInput(form, s)
+	i.reset()
+	return i
+}
+
+// setInputString points the source iterator at the string s using c's
+// normalization form, resets the iterator state, and returns i so that
+// calls can be chained.
+func (i *iter) setInputString(c *Collator, s string) *iter {
+	form := c.f
+	i.src.SetInputString(form, s)
+	i.reset()
return i
}
return i._done
}
-func (i *iter) next(ce []colElem) []colElem {
+func (i *iter) next() {
if !i.eof && len(i.buf)-i.p < i.minBufSize {
// replenish buffer
n := copy(i.buf, i.buf[i.p:])
}
if i.p == len(i.buf) {
i._done = true
- return ce
+ return
}
- ce, sz := i.t.appendNext(ce, i.buf[i.p:])
+ sz := 0
+ i.ce, sz = i.t.appendNext(i.ce, i.buf[i.p:])
i.p += sz
- return ce
+}
+
+// nextPrimary returns the next non-zero primary weight, starting at the pce
+// cursor and advancing it past every element it examines. When the already
+// decoded elements in i.ce are used up it calls next() to decode more, and
+// it returns 0 once the iterator reports done.
+func (i *iter) nextPrimary() int {
+	for {
+		for i.pce < len(i.ce) {
+			v := i.ce[i.pce].primary()
+			i.pce++
+			if v != 0 {
+				return v
+			}
+		}
+		if i.done() {
+			// Leave through a normal return instead of an unreachable
+			// trailing panic, which go vet reports as dead code.
+			break
+		}
+		i.next()
+	}
+	return 0
+}
+
+// nextSecondary returns the next non-zero secondary weight among the
+// elements already stored in i.ce, advancing pce past every element it
+// examines. It does not decode further input and returns 0 when i.ce is
+// exhausted.
+func (i *iter) nextSecondary() int {
+	for i.pce < len(i.ce) {
+		v := i.ce[i.pce].secondary()
+		i.pce++
+		if v != 0 {
+			return v
+		}
+	}
+	return 0
+}
+
+// prevSecondary is the back-to-front counterpart of nextSecondary: pce
+// counts how many elements have been consumed from the end of i.ce. It
+// returns the next non-zero secondary weight in that direction, or 0 when
+// every stored element has been visited.
+func (i *iter) prevSecondary() int {
+	last := len(i.ce) - 1
+	for i.pce < len(i.ce) {
+		v := i.ce[last-i.pce].secondary()
+		i.pce++
+		if v != 0 {
+			return v
+		}
+	}
+	return 0
+}
+
+// nextTertiary returns the next non-zero tertiary weight, converted to int,
+// among the elements already stored in i.ce, advancing pce past every
+// element it examines. It returns 0 when i.ce is exhausted.
+func (i *iter) nextTertiary() int {
+	for i.pce < len(i.ce) {
+		v := i.ce[i.pce].tertiary()
+		i.pce++
+		if v != 0 {
+			return int(v)
+		}
+	}
+	return 0
+}
+
+// nextQuaternary returns the next non-zero quaternary weight among the
+// elements already stored in i.ce, advancing pce past every element it
+// examines. It returns 0 when i.ce is exhausted.
+func (i *iter) nextQuaternary() int {
+	for i.pce < len(i.ce) {
+		v := i.ce[i.pce].quaternary()
+		i.pce++
+		if v != 0 {
+			return v
+		}
+	}
+	return 0
}
func appendPrimary(key []byte, p int) []byte {
func TestKeyFromElems(t *testing.T) {
buf := collate.Buffer{}
for i, tt := range keyFromElemTests {
- buf.ResetKeys()
+ buf.Reset()
ws := collate.ProcessWeights(tt.opt.alt, tt.opt.top, tt.in)
res := collate.KeyFromElems(tt.opt.collator(), &buf, ws)
if len(res) != len(tt.out) {
// error is reported in TestAppendNext
continue
}
- buf := collate.Buffer{}
// Create one large test per table
str := make([]byte, 0, 4000)
out := ColElems{}
}
}
for j, chk := range append(tt.chk, check{string(str), len(str), out}) {
- ws := collate.GetColElems(c, &buf, []byte(chk.in)[:chk.n])
+ ws := collate.GetColElems(c, []byte(chk.in)[:chk.n])
if len(ws) != len(chk.out) {
t.Errorf("%d:%d: len(ws) was %d; want %d", i, j, len(ws), len(chk.out))
continue
var compareTests = []compareTest{
{"a\u0301", "a", 1},
+ {"a\u0301b", "ab", 1},
{"a", "a\u0301", -1},
+ {"ab", "a\u0301b", -1},
+ {"bc", "a\u0301c", 1},
+ {"ab", "aB", -1},
{"a\u0301", "a\u0301", 0},
{"a", "a", 0},
+ // Only clip prefixes of whole runes.
+ {"\u302E", "\u302F", 1},
+ // Don't clip prefixes when last rune of prefix may be part of contraction.
+ {"a\u035E", "a\u0301\u035F", -1},
+ {"a\u0301\u035Fb", "a\u0301\u035F", -1},
}
func TestCompare(t *testing.T) {
c, _ := makeTable(appendNextTests[4].in)
- buf := collate.Buffer{}
for i, tt := range compareTests {
- if res := c.Compare(&buf, []byte(tt.a), []byte(tt.b)); res != tt.res {
+ if res := c.Compare([]byte(tt.a), []byte(tt.b)); res != tt.res {
t.Errorf("%d: Compare(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res)
}
- if res := c.CompareString(&buf, tt.a, tt.b); res != tt.res {
+ if res := c.CompareString(tt.a, tt.b); res != tt.res {
t.Errorf("%d: CompareString(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res)
}
}
{"\u0316", [][]int{{0, 220}}},
{"\u0317", [][]int{{0, 220}, {0, 220}}},
{"\u302D", [][]int{{0, 222}}},
- {"\u302E", [][]int{{0, 224}}}, // used as starter
+ {"\u302E", [][]int{{0, 225}}}, // used as starter
{"\u302F", [][]int{{0, 224}}}, // used as starter
{"\u18A9", [][]int{{0, 228}}},
{"\u0300", [][]int{{0, 230}}},
{"a\u035Db\u035D", [][]int{{117}}},
{"a\u0301\u035Db", [][]int{{120}}},
{"a\u0301\u035F", [][]int{{121}}},
- {"a\u0301\u035Fb", [][]int{{122}}},
+ {"a\u0301\u035Fb", [][]int{{119}}},
{"\u03B1\u0345", [][]int{{901}, {902}}},
{"\u302E\u18A9", [][]int{{0, 131}, {0, 132}}},
{"\u302F\u18A9", [][]int{{0, 130}}},
// multiple gaps
{"a\u0301\u035Db", 6, ColElems{w(120)}},
{"a\u0301\u035F", 5, ColElems{w(121)}},
- {"a\u0301\u035Fb", 6, ColElems{w(122)}},
+ {"a\u0301\u035Fb", 6, ColElems{w(119)}},
{"a\u0316\u0301\u035F", 7, ColElems{w(121), w(0, 220)}},
{"a\u0301\u0315\u035Fb", 7, ColElems{w(121), w(0, 232)}},
{"a\u0316\u0301\u0315\u035Db", 5, ColElems{w(102), w(0, 220)}},