exp/locale/collate: changed implementation of Compare and CompareString to

author Marcel van Lohuizen <mpvl@golang.org>

Thu, 15 Nov 2012 21:23:56 +0000 (22:23 +0100)

committer Marcel van Lohuizen <mpvl@golang.org>

Thu, 15 Nov 2012 21:23:56 +0000 (22:23 +0100)
author Marcel van Lohuizen <mpvl@golang.org>
Thu, 15 Nov 2012 21:23:56 +0000 (22:23 +0100)
committer Marcel van Lohuizen <mpvl@golang.org>
Thu, 15 Nov 2012 21:23:56 +0000 (22:23 +0100)
diff --git a/src/pkg/exp/locale/collate/collate.go b/src/pkg/exp/locale/collate/collate.go

index 2b2c8eba1f6059d31dc9054037c5ddee02de65ce..a08dcae0d5f84f3ae219eeafb324532bcf5ed212 100644 (file)
--- a/src/pkg/exp/locale/collate/collate.go
+++ b/src/pkg/exp/locale/collate/collate.go
@@ -83,6 +83,13 @@ type Collator struct {
         f norm.Form
  
         t *table
+
+       _iter [2]iter
+}
+
+func (c *Collator) iter(i int) *iter {
+       // TODO: evaluate performance for making the second iterator optional.
+       return &c._iter[i]
  }
  
  // Locales returns the list of locales for which collating differs from its parent locale.
@@ -100,11 +107,18 @@ func New(loc string) *Collator {
                         t = mainTable.indexedTable(idx)
                 }
         }
-       return &Collator{
+       return newCollator(t)
+}
+
+func newCollator(t *table) *Collator {
+       c := &Collator{
                 Strength: Quaternary,
                 f:        norm.NFD,
                 t:        t,
         }
+       c._iter[0].init(c)
+       c._iter[1].init(c)
+       return c
  }
  
  // SetVariableTop sets all runes with primary strength less than the primary
@@ -113,63 +127,114 @@ func (c *Collator) SetVariableTop(r rune) {
         // TODO: implement
  }
  
-// Buffer holds reusable buffers that can be used during collation.
-// Reusing a Buffer for the various calls that accept it may avoid
-// unnecessary memory allocations.
+// Buffer holds keys generated by Key and KeyString.
  type Buffer struct {
-       // TODO: try various parameters and techniques, such as using
-       // a chan of buffers for a pool.
-       ba  [4096]byte
-       wa  [512]colElem
+       buf [4096]byte
         key []byte
-       ce  []colElem
  }
  
  func (b *Buffer) init() {
-       if b.ce == nil {
-               b.ce = b.wa[:0]
-               b.key = b.ba[:0]
-       } else {
-               b.ce = b.ce[:0]
+       if b.key == nil {
+               b.key = b.buf[:0]
         }
  }
  
-// ResetKeys clears the buffer used for generated keys. Calling ResetKeys
-// invalidates keys previously obtained from Key or KeyFromString.
-func (b *Buffer) ResetKeys() {
-       b.ce = b.ce[:0]
+// Reset clears the buffer from previous results generated by Key and KeyString.
+func (b *Buffer) Reset() {
         b.key = b.key[:0]
  }
  
  // Compare returns an integer comparing the two byte slices.
  // The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
-// Compare calls ResetKeys, thereby invalidating keys
-// previously generated using Key or KeyFromString using buf.
-func (c *Collator) Compare(buf *Buffer, a, b []byte) int {
-       // TODO: for now we simply compute keys and compare.  Once we
-       // have good benchmarks, move to an implementation that works
-       // incrementally for the majority of cases.
-       // - Benchmark with long strings that only vary in modifiers.
-       buf.ResetKeys()
-       ka := c.Key(buf, a)
-       kb := c.Key(buf, b)
-       defer buf.ResetKeys()
-       return bytes.Compare(ka, kb)
+func (c *Collator) Compare(a, b []byte) int {
+       // TODO: skip identical prefixes once we have a fast way to detect if a rune is
+       // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
+       c.iter(0).setInput(c, a)
+       c.iter(1).setInput(c, b)
+       if res := c.compare(); res != 0 {
+               return res
+       }
+       if Identity == c.Strength {
+               return bytes.Compare(a, b)
+       }
+       return 0
  }
  
  // CompareString returns an integer comparing the two strings.
  // The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
-// CompareString calls ResetKeys, thereby invalidating keys
-// previously generated using Key or KeyFromString using buf.
-func (c *Collator) CompareString(buf *Buffer, a, b string) int {
-       buf.ResetKeys()
-       ka := c.KeyFromString(buf, a)
-       kb := c.KeyFromString(buf, b)
-       defer buf.ResetKeys()
-       return bytes.Compare(ka, kb)
+func (c *Collator) CompareString(a, b string) int {
+       // TODO: skip identical prefixes once we have a fast way to detect if a rune is
+       // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
+       c.iter(0).setInputString(c, a)
+       c.iter(1).setInputString(c, b)
+       if res := c.compare(); res != 0 {
+               return res
+       }
+       if Identity == c.Strength {
+               if a < b {
+                       return -1
+               } else if a > b {
+                       return 1
+               }
+       }
+       return 0
+}
+
+func compareLevel(f func(i *iter) int, a, b *iter) int {
+       a.pce = 0
+       b.pce = 0
+       for {
+               va := f(a)
+               vb := f(b)
+               if va != vb {
+                       if va < vb {
+                               return -1
+                       }
+                       return 1
+               } else if va == 0 {
+                       break
+               }
+       }
+       return 0
  }
  
-func (c *Collator) Prefix(buf *Buffer, s, prefix []byte) int {
+func (c *Collator) compare() int {
+       ia, ib := c.iter(0), c.iter(1)
+       // Process primary level
+       if c.Alternate != AltShifted {
+               // TODO: implement script reordering
+               // TODO: special hiragana handling
+               if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 {
+                       return res
+               }
+       } else {
+               // TODO: handle shifted
+       }
+       if Secondary <= c.Strength {
+               f := (*iter).nextSecondary
+               if c.Backwards {
+                       f = (*iter).prevSecondary
+               }
+               if res := compareLevel(f, ia, ib); res != 0 {
+                       return res
+               }
+       }
+       // TODO: special case handling (Danish?)
+       if Tertiary <= c.Strength || c.CaseLevel {
+               if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 {
+                       return res
+               }
+               // TODO: Not needed for the default value of AltNonIgnorable?
+               if Quaternary <= c.Strength {
+                       if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 {
+                               return res
+                       }
+               }
+       }
+       return 0
+}
+
+func (c *Collator) Prefix(s, prefix []byte) int {
         // iterate over s, track bytes consumed.
         return 0
  }
@@ -177,12 +242,11 @@ func (c *Collator) Prefix(buf *Buffer, s, prefix []byte) int {
  // Key returns the collation key for str.
  // Passing the buffer buf may avoid memory allocations.
  // The returned slice will point to an allocation in Buffer and will remain
-// valid until the next call to buf.ResetKeys().
+// valid until the next call to buf.Reset().
  func (c *Collator) Key(buf *Buffer, str []byte) []byte {
         // See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
         buf.init()
-       c.getColElems(buf, str)
-       return c.key(buf, buf.ce)
+       return c.key(buf, c.getColElems(str))
  }
  
  // KeyFromString returns the collation key for str.
@@ -192,8 +256,7 @@ func (c *Collator) Key(buf *Buffer, str []byte) []byte {
  func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
         // See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
         buf.init()
-       c.getColElemsString(buf, str)
-       return c.key(buf, buf.ce)
+       return c.key(buf, c.getColElemsString(str))
  }
  
  func (c *Collator) key(buf *Buffer, w []colElem) []byte {
@@ -203,35 +266,63 @@ func (c *Collator) key(buf *Buffer, w []colElem) []byte {
         return buf.key[kn:]
  }
  
-func (c *Collator) getColElems(buf *Buffer, str []byte) {
-       i := c.iter()
-       i.src.SetInput(c.f, str)
+func (c *Collator) getColElems(str []byte) []colElem {
+       i := c.iter(0)
+       i.setInput(c, str)
         for !i.done() {
-               buf.ce = i.next(buf.ce)
+               i.next()
         }
+       return i.ce
  }
  
-func (c *Collator) getColElemsString(buf *Buffer, str string) {
-       i := c.iter()
-       i.src.SetInputString(c.f, str)
+func (c *Collator) getColElemsString(str string) []colElem {
+       i := c.iter(0)
+       i.setInputString(c, str)
         for !i.done() {
-               buf.ce = i.next(buf.ce)
+               i.next()
         }
+       return i.ce
  }
  
  type iter struct {
         src        norm.Iter
-       ba         [1024]byte
+       norm       [1024]byte
         buf        []byte
-       t          *table
         p          int
         minBufSize int
+
+       wa  [512]colElem
+       ce  []colElem
+       pce int
+
+       t          *table
         _done, eof bool
  }
  
-func (c *Collator) iter() iter {
-       i := iter{t: c.t, minBufSize: c.t.maxContractLen}
-       i.buf = i.ba[:0]
+func (i *iter) init(c *Collator) {
+       i.t = c.t
+       i.minBufSize = c.t.maxContractLen
+       i.ce = i.wa[:0]
+       i.buf = i.norm[:0]
+}
+
+func (i *iter) reset() {
+       i.ce = i.ce[:0]
+       i.buf = i.buf[:0]
+       i.p = 0
+       i.eof = i.src.Done()
+       i._done = i.eof
+}
+
+func (i *iter) setInput(c *Collator, s []byte) *iter {
+       i.src.SetInput(c.f, s)
+       i.reset()
+       return i
+}
+
+func (i *iter) setInputString(c *Collator, s string) *iter {
+       i.src.SetInputString(c.f, s)
+       i.reset()
         return i
  }
  
@@ -239,7 +330,7 @@ func (i *iter) done() bool {
         return i._done
  }
  
-func (i *iter) next(ce []colElem) []colElem {
+func (i *iter) next() {
         if !i.eof && len(i.buf)-i.p < i.minBufSize {
                 // replenish buffer
                 n := copy(i.buf, i.buf[i.p:])
@@ -250,11 +341,67 @@ func (i *iter) next(ce []colElem) []colElem {
         }
         if i.p == len(i.buf) {
                 i._done = true
-               return ce
+               return
         }
-       ce, sz := i.t.appendNext(ce, i.buf[i.p:])
+       sz := 0
+       i.ce, sz = i.t.appendNext(i.ce, i.buf[i.p:])
         i.p += sz
-       return ce
+}
+
+func (i *iter) nextPrimary() int {
+       for {
+               for ; i.pce < len(i.ce); i.pce++ {
+                       if v := i.ce[i.pce].primary(); v != 0 {
+                               i.pce++
+                               return v
+                       }
+               }
+               if i.done() {
+                       return 0
+               }
+               i.next()
+       }
+       panic("should not reach here")
+}
+
+func (i *iter) nextSecondary() int {
+       for ; i.pce < len(i.ce); i.pce++ {
+               if v := i.ce[i.pce].secondary(); v != 0 {
+                       i.pce++
+                       return v
+               }
+       }
+       return 0
+}
+
+func (i *iter) prevSecondary() int {
+       for ; i.pce < len(i.ce); i.pce++ {
+               if v := i.ce[len(i.ce)-i.pce-1].secondary(); v != 0 {
+                       i.pce++
+                       return v
+               }
+       }
+       return 0
+}
+
+func (i *iter) nextTertiary() int {
+       for ; i.pce < len(i.ce); i.pce++ {
+               if v := i.ce[i.pce].tertiary(); v != 0 {
+                       i.pce++
+                       return int(v)
+               }
+       }
+       return 0
+}
+
+func (i *iter) nextQuaternary() int {
+       for ; i.pce < len(i.ce); i.pce++ {
+               if v := i.ce[i.pce].quaternary(); v != 0 {
+                       i.pce++
+                       return v
+               }
+       }
+       return 0
  }
  
  func appendPrimary(key []byte, p int) []byte {
diff --git a/src/pkg/exp/locale/collate/collate_test.go b/src/pkg/exp/locale/collate/collate_test.go

index fa7c30262ac62f15df85e8401a1cb3b889ad99b9..6d6d6bce24c42a738364afca38ea36acd5fdad71 100644 (file)
--- a/src/pkg/exp/locale/collate/collate_test.go
+++ b/src/pkg/exp/locale/collate/collate_test.go
@@ -300,7 +300,7 @@ var keyFromElemTests = []keyFromElemTest{
  func TestKeyFromElems(t *testing.T) {
         buf := collate.Buffer{}
         for i, tt := range keyFromElemTests {
-               buf.ResetKeys()
+               buf.Reset()
                 ws := collate.ProcessWeights(tt.opt.alt, tt.opt.top, tt.in)
                 res := collate.KeyFromElems(tt.opt.collator(), &buf, ws)
                 if len(res) != len(tt.out) {
@@ -325,7 +325,6 @@ func TestGetColElems(t *testing.T) {
                         // error is reported in TestAppendNext
                         continue
                 }
-               buf := collate.Buffer{}
                 // Create one large test per table
                 str := make([]byte, 0, 4000)
                 out := ColElems{}
@@ -336,7 +335,7 @@ func TestGetColElems(t *testing.T) {
                         }
                 }
                 for j, chk := range append(tt.chk, check{string(str), len(str), out}) {
-                       ws := collate.GetColElems(c, &buf, []byte(chk.in)[:chk.n])
+                       ws := collate.GetColElems(c, []byte(chk.in)[:chk.n])
                         if len(ws) != len(chk.out) {
                                 t.Errorf("%d:%d: len(ws) was %d; want %d", i, j, len(ws), len(chk.out))
                                 continue
@@ -404,19 +403,27 @@ type compareTest struct {
  
  var compareTests = []compareTest{
         {"a\u0301", "a", 1},
+       {"a\u0301b", "ab", 1},
         {"a", "a\u0301", -1},
+       {"ab", "a\u0301b", -1},
+       {"bc", "a\u0301c", 1},
+       {"ab", "aB", -1},
         {"a\u0301", "a\u0301", 0},
         {"a", "a", 0},
+       // Only clip prefixes of whole runes.
+       {"\u302E", "\u302F", 1},
+       // Don't clip prefixes when last rune of prefix may be part of contraction.
+       {"a\u035E", "a\u0301\u035F", -1},
+       {"a\u0301\u035Fb", "a\u0301\u035F", -1},
  }
  
  func TestCompare(t *testing.T) {
         c, _ := makeTable(appendNextTests[4].in)
-       buf := collate.Buffer{}
         for i, tt := range compareTests {
-               if res := c.Compare(&buf, []byte(tt.a), []byte(tt.b)); res != tt.res {
+               if res := c.Compare([]byte(tt.a), []byte(tt.b)); res != tt.res {
                         t.Errorf("%d: Compare(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res)
                 }
-               if res := c.CompareString(&buf, tt.a, tt.b); res != tt.res {
+               if res := c.CompareString(tt.a, tt.b); res != tt.res {
                         t.Errorf("%d: CompareString(%q, %q) == %d; want %d", i, tt.a, tt.b, res, tt.res)
                 }
         }
diff --git a/src/pkg/exp/locale/collate/export.go b/src/pkg/exp/locale/collate/export.go

index 01750dd070e512ff78f0f0be9182afe10c8b5cc6..1915c93963be79b2031f1a3ce052247b1a72f76e 100644 (file)
--- a/src/pkg/exp/locale/collate/export.go
+++ b/src/pkg/exp/locale/collate/export.go
@@ -4,8 +4,6 @@
  
  package collate
  
-import "exp/norm"
-
  // Init is used by type Builder in exp/locale/collate/build/
  // to create Collator instances.  It is for internal use only.
  func Init(data interface{}) *Collator {
@@ -24,11 +22,7 @@ func Init(data interface{}) *Collator {
         t.contractElem = init.ContractElems()
         t.maxContractLen = init.MaxContractLen()
         t.variableTop = init.VariableTop()
-       return &Collator{
-               Strength: Quaternary,
-               f:        norm.NFD,
-               t:        t,
-       }
+       return newCollator(t)
  }
  
  type tableInitializer interface {
diff --git a/src/pkg/exp/locale/collate/export_test.go b/src/pkg/exp/locale/collate/export_test.go

index 09caccac5f5af49999040fb8c2cfba36988ee9ab..3afe66457942ad624c9562226c8c70dd7c455fef 100644 (file)
--- a/src/pkg/exp/locale/collate/export_test.go
+++ b/src/pkg/exp/locale/collate/export_test.go
@@ -72,10 +72,9 @@ func SetTop(c *Collator, top int) {
         c.t.variableTop = uint32(top)
  }
  
-func GetColElems(c *Collator, buf *Buffer, str []byte) []Weights {
-       buf.ResetKeys()
-       c.getColElems(buf, str)
-       return convertToWeights(buf.ce)
+func GetColElems(c *Collator, str []byte) []Weights {
+       ce := c.getColElems(str)
+       return convertToWeights(ce)
  }
  
  func ProcessWeights(h AlternateHandling, top int, w []Weights) []Weights {
diff --git a/src/pkg/exp/locale/collate/maketables.go b/src/pkg/exp/locale/collate/maketables.go

index 80e3c394f1817b34cadcb0b0edfd89aff024b1b1..40bf10ab449d36f6f23c63c0c98bf9b153f2f969 100644 (file)
--- a/src/pkg/exp/locale/collate/maketables.go
+++ b/src/pkg/exp/locale/collate/maketables.go
@@ -180,7 +180,7 @@ func skipAlt(a string) bool {
  
  func failOnError(e error) {
         if e != nil {
-               log.Fatal(e)
+               log.Panic(e)
         }
  }
  
@@ -677,7 +677,7 @@ func testCollator(c *collate.Collator) {
                 if bytes.Compare(k0, k) != 0 {
                         failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k))
                 }
-               buf.ResetKeys()
+               buf.Reset()
         }
         fmt.Println("PASS")
  }
diff --git a/src/pkg/exp/locale/collate/regtest.go b/src/pkg/exp/locale/collate/regtest.go

index 14a447c1e4e7cf5c8b7c63b58236f1d96b073cee..e30915ed894aba9268917673debab3b29d1bff79 100644 (file)
--- a/src/pkg/exp/locale/collate/regtest.go
+++ b/src/pkg/exp/locale/collate/regtest.go
@@ -236,9 +236,9 @@ func doTest(t Test) {
         if strings.Contains(t.name, "NON_IGNOR") {
                 c.Alternate = collate.AltNonIgnorable
         }
-
         prev := t.str[0]
         for i := 1; i < len(t.str); i++ {
+               b.Reset()
                 s := t.str[i]
                 ka := c.Key(b, prev)
                 kb := c.Key(b, s)
@@ -247,10 +247,10 @@ func doTest(t Test) {
                         prev = s
                         continue
                 }
-               if r := c.Compare(b, prev, s); r == 1 {
+               if r := c.Compare(prev, s); r == 1 {
                         fail(t, "%d: Compare(%.4X, %.4X) == %d; want -1 or 0", i, runes(prev), runes(s), r)
                 }
-               if r := c.Compare(b, s, prev); r == -1 {
+               if r := c.Compare(s, prev); r == -1 {
                         fail(t, "%d: Compare(%.4X, %.4X) == %d; want 1 or 0", i, runes(s), runes(prev), r)
                 }
                 prev = s
diff --git a/src/pkg/exp/locale/collate/table_test.go b/src/pkg/exp/locale/collate/table_test.go

index 446d592b6d2eff18712310667ffce9e85aa3e790..95b74fdc51f8e42ed366532891ce08a878afb288 100644 (file)
--- a/src/pkg/exp/locale/collate/table_test.go
+++ b/src/pkg/exp/locale/collate/table_test.go
@@ -141,7 +141,7 @@ var appendNextTests = []tableTest{
                         {"\u0316", [][]int{{0, 220}}},
                         {"\u0317", [][]int{{0, 220}, {0, 220}}},
                         {"\u302D", [][]int{{0, 222}}},
-                       {"\u302E", [][]int{{0, 224}}}, // used as starter
+                       {"\u302E", [][]int{{0, 225}}}, // used as starter
                         {"\u302F", [][]int{{0, 224}}}, // used as starter
                         {"\u18A9", [][]int{{0, 228}}},
                         {"\u0300", [][]int{{0, 230}}},
@@ -169,7 +169,7 @@ var appendNextTests = []tableTest{
                         {"a\u035Db\u035D", [][]int{{117}}},
                         {"a\u0301\u035Db", [][]int{{120}}},
                         {"a\u0301\u035F", [][]int{{121}}},
-                       {"a\u0301\u035Fb", [][]int{{122}}},
+                       {"a\u0301\u035Fb", [][]int{{119}}},
                         {"\u03B1\u0345", [][]int{{901}, {902}}},
                         {"\u302E\u18A9", [][]int{{0, 131}, {0, 132}}},
                         {"\u302F\u18A9", [][]int{{0, 130}}},
@@ -192,7 +192,7 @@ var appendNextTests = []tableTest{
                         // multiple gaps
                         {"a\u0301\u035Db", 6, ColElems{w(120)}},
                         {"a\u0301\u035F", 5, ColElems{w(121)}},
-                       {"a\u0301\u035Fb", 6, ColElems{w(122)}},
+                       {"a\u0301\u035Fb", 6, ColElems{w(119)}},
                         {"a\u0316\u0301\u035F", 7, ColElems{w(121), w(0, 220)}},
                         {"a\u0301\u0315\u035Fb", 7, ColElems{w(121), w(0, 232)}},
                         {"a\u0316\u0301\u0315\u035Db", 5, ColElems{w(102), w(0, 220)}},
diff --git a/src/pkg/exp/locale/collate/tools/colcmp/col.go b/src/pkg/exp/locale/collate/tools/colcmp/col.go

index 26e015cb2f97c1dd9a9ccd57ba99d8e5462a7cce..3f8d7eed655835d5ff084dc538e17e17d4c8c233 100644 (file)
--- a/src/pkg/exp/locale/collate/tools/colcmp/col.go
+++ b/src/pkg/exp/locale/collate/tools/colcmp/col.go
@@ -91,5 +91,5 @@ func (c *goCollator) Key(b Input) []byte {
  }
  
  func (c *goCollator) Compare(a, b Input) int {
-       return c.c.Compare(&c.buf, a.UTF8, b.UTF8)
+       return c.c.Compare(a.UTF8, b.UTF8)
  }
author	Marcel van Lohuizen <mpvl@golang.org>
	Thu, 15 Nov 2012 21:23:56 +0000 (22:23 +0100)
committer	Marcel van Lohuizen <mpvl@golang.org>
	Thu, 15 Nov 2012 21:23:56 +0000 (22:23 +0100)
src/pkg/exp/locale/collate/collate.go		patch \| blob \| history
src/pkg/exp/locale/collate/collate_test.go		patch \| blob \| history
src/pkg/exp/locale/collate/export.go		patch \| blob \| history
src/pkg/exp/locale/collate/export_test.go		patch \| blob \| history
src/pkg/exp/locale/collate/maketables.go		patch \| blob \| history
src/pkg/exp/locale/collate/regtest.go		patch \| blob \| history
src/pkg/exp/locale/collate/table_test.go		patch \| blob \| history
src/pkg/exp/locale/collate/tools/colcmp/col.go		patch \| blob \| history