--- /dev/null
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package collate contains types for comparing and sorting Unicode strings
+// according to a given collation order. Package locale provides a high-level
+// interface to collation. Users should typically use that package instead.
+package collate
+
+import (
+ "exp/norm"
+)
+
+// Level identifies the collation comparison level.
+// The primary level corresponds to the basic sorting of text.
+// The secondary level corresponds to accents and related linguistic elements.
+// The tertiary level corresponds to casing and related concepts.
+// The quaternary level is derived from the other levels by the
+// various algorithms for handling variable elements.
+type Level int
+
+const (
+	Primary   Level = iota // base character differences
+	Secondary              // accent and other secondary differences
+	Tertiary               // case and related differences
+	Quaternary             // derived level used for variable-element handling
+	Identity               // distinguishes strings equal at all other levels (UCA identical level)
+)
+
+// AlternateHandling identifies the various ways in which variables are handled.
+// A rune with a primary weight lower than the variable top is considered a
+// variable.
+// See http://www.unicode.org/reports/tr10/#Variable_Weighting for details.
+type AlternateHandling int
+
+const (
+	// AltShifted sets variables to be ignorable for levels one through three and
+	// adds a fourth level based on the values of the ignored levels.
+	// It is the zero value and hence the default for a Collator.
+	AltShifted AlternateHandling = iota
+
+	// AltNonIgnorable turns off special handling of variables.
+	AltNonIgnorable
+
+	// AltBlanked sets variables and all subsequent primary ignorables to be
+	// ignorable at all levels. This is identical to removing all variables
+	// and subsequent primary ignorables from the input.
+	AltBlanked
+
+	// AltShiftTrimmed is a slight variant of AltShifted that is used to
+	// emulate POSIX.
+	AltShiftTrimmed
+)
+
+// Collator provides functionality for comparing strings for a given
+// collation order.
+type Collator struct {
+	// See SetVariableTop.
+	variableTop uint32
+
+	// Strength sets the maximum level to use in comparison.
+	Strength Level
+
+	// Alternate specifies an alternative handling of variables.
+	Alternate AlternateHandling
+
+	// Backwards specifies the order of sorting at the secondary level.
+	// This option exists predominantly to support reverse sorting of accents in French.
+	Backwards bool
+
+	// With HiraganaQuaternary enabled, Hiragana codepoints will get lower values
+	// than all the other non-variable code points. Strength must be greater or
+	// equal to Quaternary for this to take effect.
+	HiraganaQuaternary bool
+
+	// If CaseLevel is true, a level consisting only of case characteristics will
+	// be inserted in front of the tertiary level. To ignore accents but take
+	// cases into account, set Strength to Primary and CaseLevel to true.
+	CaseLevel bool
+
+	// If Numeric is true, any sequence of decimal digits (category is Nd) is sorted
+	// at a primary level with its numeric value. For example, "A-21" < "A-123".
+	Numeric bool
+
+	// f is the normalization form applied to input before collation.
+	// NOTE(review): assumed to be set by whatever constructs the Collator;
+	// confirm against the table builder.
+	f norm.Form
+
+	// t holds the collation element table for this collation order.
+	t *table
+}
+
+// SetVariableTop sets all runes with primary strength less than the primary
+// strength of r to be variable and thus affected by alternate handling.
+func (c *Collator) SetVariableTop(r rune) {
+	// TODO: implement (should derive variableTop from r's primary weight).
+}
+
+var (
+	// Root is a Collator with default (zero-value) options.
+	// NOTE(review): no table is attached here; presumably this becomes the
+	// root (default Unicode) collation order once tables are wired in —
+	// confirm before use.
+	Root = Collator{}
+)
+
+// Buffer holds reusable buffers that can be used during collation.
+// Reusing a Buffer for the various calls that accept it may avoid
+// unnecessary memory allocations.
+type Buffer struct {
+	// TODO: try various parameters and techniques, such as using
+	// a chan of buffers for a pool.
+	ba  [4096]byte   // initial backing storage for key (see init)
+	wa  [512]weights // initial backing storage for ce (see init)
+	key []byte       // accumulated key bytes; starts out aliasing ba
+	ce  []weights    // scratch collation elements; starts out aliasing wa
+}
+
+// init prepares the buffer for use. On first use it points the working
+// slices at the preallocated backing arrays; on later calls it merely
+// truncates the collation-element scratch slice.
+func (b *Buffer) init() {
+	if b.ce != nil {
+		b.ce = b.ce[:0]
+		return
+	}
+	b.ce = b.wa[:0]
+	b.key = b.ba[:0]
+}
+
+// ResetKeys discards all keys generated so far by truncating the internal
+// key and collation-element buffers. Keys previously returned by Key or
+// KeyFromString become invalid after this call.
+func (b *Buffer) ResetKeys() {
+	b.key = b.key[:0]
+	b.ce = b.ce[:0]
+}
+
+// Compare returns an integer comparing the two byte slices.
+// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
+// The buffer buf is reserved for scratch space during the comparison.
+func (c *Collator) Compare(buf *Buffer, a, b []byte) int {
+	// TODO: implement
+	return 0
+}
+
+// CompareString returns an integer comparing the two strings.
+// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
+// The buffer buf is reserved for scratch space during the comparison.
+func (c *Collator) CompareString(buf *Buffer, a, b string) int {
+	// TODO: implement
+	return 0
+}
+
+// Key returns the collation key for str.
+// Passing the buffer buf may avoid memory allocations.
+// The returned slice will point to an allocation in Buffer and will remain
+// valid until the next call to buf.ResetKeys().
+func (c *Collator) Key(buf *Buffer, str []byte) []byte {
+	// TODO: implement
+	return nil
+}
+
+// KeyFromString returns the collation key for str.
+// Passing the buffer buf may avoid memory allocations.
+// The returned slice will point to an allocation in Buffer and will remain
+// valid until the next call to buf.ResetKeys().
+func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
+	// TODO: implement
+	return nil
+}
--- /dev/null
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package collate
+
+import (
+ "exp/norm"
+ "unicode/utf8"
+)
+
+// table holds all collation data for a given collation ordering.
+type table struct {
+	index trie // main trie mapping input bytes to collation elements
+
+	// expansion info: expandElem[i] holds the length n of expansion i,
+	// immediately followed by the n collation elements themselves
+	// (see appendExpansion).
+	expandElem []uint32
+
+	// contraction info: contractTries holds the suffix tries used to match
+	// contractions, and contractElem holds the resulting collation elements
+	// (see matchContraction).
+	contractTries contractTrieSet
+	contractElem  []uint32
+	// maxContractLen is presumably the maximum byte length of any
+	// contraction; it is not used in this file — TODO confirm at use site.
+	maxContractLen int
+}
+
+// appendNext appends the weights corresponding to the next rune or
+// contraction in s. If a contraction is matched to a discontinuous
+// sequence of runes, the weights for the interstitial runes are
+// appended as well. It returns a new slice that includes the appended
+// weights and the number of bytes consumed from s.
+func (t *table) appendNext(w []weights, s []byte) ([]weights, int) {
+	v, sz := t.index.lookup(s)
+	ce := colElem(v)
+	tp := ce.ctype()
+	if tp == ceNormal {
+		w = append(w, getWeights(ce, s))
+	} else if tp == ceExpansionIndex {
+		w = t.appendExpansion(w, ce)
+	} else if tp == ceContractionIndex {
+		// The contraction may consume bytes beyond sz; add them in.
+		n := 0
+		w, n = t.matchContraction(w, ce, s[sz:])
+		sz += n
+	} else if tp == ceDecompose {
+		// Decompose using NFKD and replace tertiary weights.
+		t1, t2 := splitDecompose(ce)
+		i := len(w)
+		nfkd := norm.NFKD.Properties(s).Decomposition()
+		for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
+			w, p = t.appendNext(w, nfkd)
+		}
+		// The first two appended elements receive the tertiary weights
+		// encoded in ce; any remaining elements get maxTertiary.
+		w[i].tertiary = t1
+		if i++; i < len(w) {
+			w[i].tertiary = t2
+			for i++; i < len(w); i++ {
+				w[i].tertiary = maxTertiary
+			}
+		}
+	}
+	return w, sz
+}
+
+// getWeights returns the collation weights encoded in ce. A zero colElem
+// denotes an implicit weight, which is derived from the first rune of s.
+func getWeights(ce colElem, s []byte) weights {
+	if ce != 0 {
+		return splitCE(ce)
+	}
+	// Implicit: the primary weight is computed from the code point itself.
+	first, _ := utf8.DecodeRune(s)
+	return weights{
+		primary:   uint32(implicitPrimary(first)),
+		secondary: defaultSecondary,
+		tertiary:  defaultTertiary,
+	}
+}
+
+// appendExpansion appends to w the sequence of collation elements referred
+// to by expansion element ce. The expansion table stores a length followed
+// by that many elements.
+func (t *table) appendExpansion(w []weights, ce colElem) []weights {
+	idx := splitExpandIndex(ce)
+	count := int(t.expandElem[idx])
+	for _, elem := range t.expandElem[idx+1 : idx+1+count] {
+		w = append(w, splitCE(colElem(elem)))
+	}
+	return w
+}
+
+// matchContraction matches a (possibly discontinuous) contraction that
+// starts with the element ce against suffix. It appends the weights for
+// the longest match found (or for ce alone), followed by the weights of
+// any interstitial runes that were skipped during a discontinuous match,
+// and returns the extended slice and the number of bytes consumed.
+func (t *table) matchContraction(w []weights, ce colElem, suffix []byte) ([]weights, int) {
+	index, n, offset := splitContractIndex(ce)
+
+	// scan walks the contraction suffix trie; buf collects interstitial
+	// runes (modifiers skipped over during a discontinuous match) so that
+	// their weights can be appended after the contraction's.
+	scan := t.contractTries.scanner(index, n, suffix)
+	buf := [norm.MaxSegmentSize]byte{}
+	bufp := 0
+	p := scan.scan(0)
+
+	if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
+		// By now we should have filtered most cases.
+		p0 := p
+		bufn := 0
+		rune := norm.NFC.Properties(suffix[p:])
+		p += rune.Size()
+		if prevCC := rune.TrailCCC(); prevCC != 0 {
+			// A gap may only occur in the last normalization segment.
+			// This also ensures that len(scan.s) < norm.MaxSegmentSize.
+			if end := norm.NFC.FirstBoundary(suffix[p:]); end != -1 {
+				scan.s = suffix[:p+end]
+			}
+			// Try to extend the match discontinuously: a non-starter may
+			// only be skipped when its lead CCC exceeds the previous
+			// trailing CCC (otherwise it is blocked; cf. UCA rule S2.1).
+			for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
+				rune = norm.NFC.Properties(suffix[p:])
+				if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
+					break
+				}
+				prevCC = rune.TrailCCC()
+				if pp := scan.scan(p); pp != p {
+					// Copy the interstitial runes for later processing.
+					bufn += copy(buf[bufn:], suffix[p0:p])
+					if scan.pindex == pp {
+						bufp = bufn
+					}
+					p, p0 = pp, pp
+				} else {
+					p += rune.Size()
+				}
+			}
+		}
+	}
+	// Append weights for the matched contraction, which may be an expansion.
+	i, n := scan.result()
+	ce = colElem(t.contractElem[i+offset])
+	if ce.ctype() == ceNormal {
+		w = append(w, splitCE(ce))
+	} else {
+		w = t.appendExpansion(w, ce)
+	}
+	// Append weights for the runes in the segment not part of the contraction.
+	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
+		w, p = t.appendNext(w, b)
+	}
+	return w, n
+}
--- /dev/null
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package collate_test
+
+import (
+ "exp/locale/collate"
+ "exp/locale/collate/build"
+ "exp/norm"
+ "testing"
+)
+
+// Weights embeds collate.Weights so the test package can build and compare
+// weight values with local helpers.
+type Weights struct {
+	collate.Weights
+}
+
+// input pairs a string with the collation elements assigned to it when
+// building a test table.
+type input struct {
+	str string
+	ces [][]int
+}
+
+// check describes one appendNext expectation: parsing in should consume
+// n bytes and yield the weights in out.
+type check struct {
+	in  string
+	n   int
+	out []Weights
+}
+
+// tableTest couples the inputs used to build a collation table with the
+// checks to run against the resulting table.
+type tableTest struct {
+	in  []input
+	chk []check
+}
+
+// w wraps collate.W to construct a Weights value from the given weight ints.
+func w(ce ...int) Weights {
+	return Weights{collate.W(ce...)}
+}
+
+// defaults holds the weight values collate assigns for unspecified levels.
+var defaults = w(0)
+
+// pt builds a collation element with primary p, the default secondary
+// weight, and tertiary t.
+func pt(p, t int) []int {
+	return []int{p, defaults.Secondary, t}
+}
+
+// makeTable constructs a Collator whose table maps each input string to
+// its listed collation elements, using the build package.
+func makeTable(in []input) (*collate.Collator, error) {
+	b := build.NewBuilder()
+	for i := range in {
+		b.Add([]rune(in[i].str), in[i].ces)
+	}
+	return b.Build("")
+}
+
+// modSeq holds a sequence of modifiers in increasing order of CCC long enough
+// to cause a segment overflow if not handled correctly. The last rune in this
+// list has a CCC of 214.
+var modSeq = []rune{
+	0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7, 0x05B8, 0x05B9, 0x05BB,
+	0x05BC, 0x05BD, 0x05BF, 0x05C1, 0x05C2, 0xFB1E, 0x064B, 0x064C, 0x064D, 0x064E,
+	0x064F, 0x0650, 0x0651, 0x0652, 0x0670, 0x0711, 0x0C55, 0x0C56, 0x0E38, 0x0E48,
+	0x0EB8, 0x0EC8, 0x0F71, 0x0F72, 0x0F74, 0x0321, 0x1DCE,
+}
+
+// mods is populated as a side effect of initializing modW below.
+var mods []input
+// modW holds, for each rune in modSeq, the expected weights (secondary
+// weight equal to the rune's CCC). Its initializer also appends the
+// corresponding table inputs to mods as a side effect.
+var modW = func() []Weights {
+	ws := []Weights{}
+	for _, r := range modSeq {
+		rune := norm.NFC.PropertiesString(string(r))
+		ws = append(ws, w(0, int(rune.CCC())))
+		mods = append(mods, input{string(r), [][]int{{0, int(rune.CCC())}}})
+	}
+	return ws
+}()
+
+// appendNextTests drives TestAppendNext: for each tableTest a collation
+// table is built from in, and each check verifies the weights produced and
+// the number of bytes consumed by appendNext for its input string.
+var appendNextTests = []tableTest{
+	{ // test getWeights
+		[]input{
+			{"a", [][]int{{100}}},
+			{"b", [][]int{{105}}},
+			{"c", [][]int{{110}}},
+			{"ß", [][]int{{120}}},
+		},
+		[]check{
+			{"a", 1, []Weights{w(100)}},
+			{"b", 1, []Weights{w(105)}},
+			{"c", 1, []Weights{w(110)}},
+			{"d", 1, []Weights{w(0x4FBA4)}},
+			{"ab", 1, []Weights{w(100)}},
+			{"bc", 1, []Weights{w(105)}},
+			{"dd", 1, []Weights{w(0x4FBA4)}},
+			{"ß", 2, []Weights{w(120)}},
+		},
+	},
+	{ // test expansion
+		[]input{
+			{"u", [][]int{{100}}},
+			{"U", [][]int{{100}, {0, 25}}},
+			{"w", [][]int{{100}, {100}}},
+			{"W", [][]int{{100}, {0, 25}, {100}, {0, 25}}},
+		},
+		[]check{
+			{"u", 1, []Weights{w(100)}},
+			{"U", 1, []Weights{w(100), w(0, 25)}},
+			{"w", 1, []Weights{w(100), w(100)}},
+			{"W", 1, []Weights{w(100), w(0, 25), w(100), w(0, 25)}},
+		},
+	},
+	{ // test decompose
+		[]input{
+			{"D", [][]int{pt(104, 8)}},
+			{"z", [][]int{pt(130, 8)}},
+			{"\u030C", [][]int{{0, 40}}},                               // Caron
+			{"\u01C5", [][]int{pt(104, 9), pt(130, 4), {0, 40, 0x1F}}}, // Dž = D+z+caron
+		},
+		[]check{
+			{"\u01C5", 2, []Weights{w(pt(104, 9)...), w(pt(130, 4)...), w(0, 40, 0x1F)}},
+		},
+	},
+	{ // test basic contraction
+		[]input{
+			{"a", [][]int{{100}}},
+			{"ab", [][]int{{101}}},
+			{"aab", [][]int{{101}, {101}}},
+			{"abc", [][]int{{102}}},
+			{"b", [][]int{{200}}},
+			{"c", [][]int{{300}}},
+			{"d", [][]int{{400}}},
+		},
+		[]check{
+			{"a", 1, []Weights{w(100)}},
+			{"aa", 1, []Weights{w(100)}},
+			{"aac", 1, []Weights{w(100)}},
+			{"ab", 2, []Weights{w(101)}},
+			{"abb", 2, []Weights{w(101)}},
+			{"aab", 3, []Weights{w(101), w(101)}},
+			{"aaba", 3, []Weights{w(101), w(101)}},
+			{"abc", 3, []Weights{w(102)}},
+			{"abcd", 3, []Weights{w(102)}},
+			{"d", 1, []Weights{w(400)}},
+		},
+	},
+	{ // test discontinuous contraction
+		append(mods, []input{
+			// modifiers; secondary weight equals ccc
+			{"\u0316", [][]int{{0, 220}}},
+			{"\u0317", [][]int{{0, 220}, {0, 220}}},
+			{"\u302D", [][]int{{0, 222}}},
+			{"\u302E", [][]int{{0, 224}}}, // used as starter
+			{"\u302F", [][]int{{0, 224}}}, // used as starter
+			{"\u18A9", [][]int{{0, 228}}},
+			{"\u0300", [][]int{{0, 230}}},
+			{"\u0301", [][]int{{0, 230}}},
+			{"\u0315", [][]int{{0, 232}}},
+			{"\u031A", [][]int{{0, 232}}},
+			{"\u035C", [][]int{{0, 233}}},
+			{"\u035F", [][]int{{0, 233}}},
+			{"\u035D", [][]int{{0, 234}}},
+			{"\u035E", [][]int{{0, 234}}},
+			{"\u0345", [][]int{{0, 240}}},
+
+			// starters
+			{"a", [][]int{{100}}},
+			{"b", [][]int{{200}}},
+			{"c", [][]int{{300}}},
+			{"\u03B1", [][]int{{900}}},
+
+			// contractions
+			{"a\u0300", [][]int{{101}}},
+			{"a\u0301", [][]int{{102}}},
+			{"a\u035E", [][]int{{110}}},
+			{"a\u035Eb\u035E", [][]int{{115}}},
+			{"ac\u035Eaca\u035E", [][]int{{116}}},
+			{"a\u035Db\u035D", [][]int{{117}}},
+			{"a\u0301\u035Db", [][]int{{120}}},
+			{"a\u0301\u035F", [][]int{{121}}},
+			{"a\u0301\u035Fb", [][]int{{122}}},
+			{"\u03B1\u0345", [][]int{{901}, {902}}},
+			{"\u302E\u18A9", [][]int{{0, 131}, {0, 132}}},
+			{"\u302F\u18A9", [][]int{{0, 130}}},
+		}...),
+		[]check{
+			{"ab", 1, []Weights{w(100)}},                              // closing segment
+			{"a\u0316\u0300b", 5, []Weights{w(101), w(0, 220)}},       // closing segment
+			{"a\u0316\u0300", 5, []Weights{w(101), w(0, 220)}},        // no closing segment
+			{"a\u0316\u0300\u035Cb", 5, []Weights{w(101), w(0, 220)}}, // completes before segment end
+			{"a\u0316\u0300\u035C", 5, []Weights{w(101), w(0, 220)}},  // completes before segment end
+
+			{"a\u0316\u0301b", 5, []Weights{w(102), w(0, 220)}},       // closing segment
+			{"a\u0316\u0301", 5, []Weights{w(102), w(0, 220)}},        // no closing segment
+			{"a\u0316\u0301\u035Cb", 5, []Weights{w(102), w(0, 220)}}, // completes before segment end
+			{"a\u0316\u0301\u035C", 5, []Weights{w(102), w(0, 220)}},  // completes before segment end
+
+			// match blocked by modifier with same ccc
+			{"a\u0301\u0315\u031A\u035Fb", 3, []Weights{w(102)}},
+
+			// multiple gaps
+			{"a\u0301\u035Db", 6, []Weights{w(120)}},
+			{"a\u0301\u035F", 5, []Weights{w(121)}},
+			{"a\u0301\u035Fb", 6, []Weights{w(122)}},
+			{"a\u0316\u0301\u035F", 7, []Weights{w(121), w(0, 220)}},
+			{"a\u0301\u0315\u035Fb", 7, []Weights{w(121), w(0, 232)}},
+			{"a\u0316\u0301\u0315\u035Db", 5, []Weights{w(102), w(0, 220)}},
+			{"a\u0316\u0301\u0315\u035F", 9, []Weights{w(121), w(0, 220), w(0, 232)}},
+			{"a\u0316\u0301\u0315\u035Fb", 9, []Weights{w(121), w(0, 220), w(0, 232)}},
+			{"a\u0316\u0301\u0315\u035F\u035D", 9, []Weights{w(121), w(0, 220), w(0, 232)}},
+			{"a\u0316\u0301\u0315\u035F\u035Db", 9, []Weights{w(121), w(0, 220), w(0, 232)}},
+
+			// handling of segment overflow
+			{ // just fits within segment
+				"a" + string(modSeq[:30]) + "\u0301",
+				3 + len(string(modSeq[:30])),
+				append([]Weights{w(102)}, modW[:30]...),
+			},
+			{"a" + string(modSeq[:31]) + "\u0301", 1, []Weights{w(100)}}, // overflow
+			{"a" + string(modSeq) + "\u0301", 1, []Weights{w(100)}},
+			{ // just fits within segment with two interstitial runes
+				"a" + string(modSeq[:28]) + "\u0301\u0315\u035F",
+				7 + len(string(modSeq[:28])),
+				append(append([]Weights{w(121)}, modW[:28]...), w(0, 232)),
+			},
+			{ // second half does not fit within segment
+				"a" + string(modSeq[:29]) + "\u0301\u0315\u035F",
+				3 + len(string(modSeq[:29])),
+				append([]Weights{w(102)}, modW[:29]...),
+			},
+
+			// discontinuity can only occur in last normalization segment
+			{"a\u035Eb\u035E", 6, []Weights{w(115)}},
+			{"a\u0316\u035Eb\u035E", 5, []Weights{w(110), w(0, 220)}},
+			{"a\u035Db\u035D", 6, []Weights{w(117)}},
+			{"a\u0316\u035Db\u035D", 1, []Weights{w(100)}},
+			{"a\u035Eb\u0316\u035E", 8, []Weights{w(115), w(0, 220)}},
+			{"a\u035Db\u0316\u035D", 8, []Weights{w(117), w(0, 220)}},
+			{"ac\u035Eaca\u035E", 9, []Weights{w(116)}},
+			{"a\u0316c\u035Eaca\u035E", 1, []Weights{w(100)}},
+			{"ac\u035Eac\u0316a\u035E", 1, []Weights{w(100)}},
+
+			// expanding contraction
+			{"\u03B1\u0345", 4, []Weights{w(901), w(902)}},
+
+			// Theoretical possibilities
+			// contraction within a gap
+			{"a\u302F\u18A9\u0301", 9, []Weights{w(102), w(0, 130)}},
+			// expansion within a gap
+			{"a\u0317\u0301", 5, []Weights{w(102), w(0, 220), w(0, 220)}},
+			{"a\u302E\u18A9\u0301", 9, []Weights{w(102), w(0, 131), w(0, 132)}},
+			{
+				"a\u0317\u302E\u18A9\u0301",
+				11,
+				[]Weights{w(102), w(0, 220), w(0, 220), w(0, 131), w(0, 132)},
+			},
+		},
+	},
+}
+
+// TestAppendNext builds a collation table for each test case and verifies
+// that appendNext consumes the expected number of bytes and produces the
+// expected weights for every check input.
+func TestAppendNext(t *testing.T) {
+	for testNo, tst := range appendNextTests {
+		c, err := makeTable(tst.in)
+		if err != nil {
+			t.Errorf("%d: error creating table: %v", testNo, err)
+			continue
+		}
+		tbl := collate.GetTable(c)
+		for checkNo, ck := range tst.chk {
+			got, consumed := tbl.AppendNext([]byte(ck.in))
+			if consumed != ck.n {
+				t.Errorf("%d:%d: bytes consumed was %d; want %d", testNo, checkNo, consumed, ck.n)
+			}
+			if len(got) != len(ck.out) {
+				t.Errorf("%d:%d: len(ws) was %d; want %d (%v vs %v)\n%X", testNo, checkNo, len(got), len(ck.out), got, ck.out, ck.in)
+				continue
+			}
+			for k, ws := range got {
+				if ws != ck.out[k].Weights {
+					t.Errorf("%d:%d: Weights %d was %v; want %v", testNo, checkNo, k, ws, ck.out[k])
+				}
+			}
+		}
+	}
+}