"log"
"sort"
"strings"
+ "unicode/utf8"
)
// TODO: optimizations:
// - trie valueBlocks are currently 100K. There are a lot of sparse blocks
// and many consecutive values with the same stride. This can be further
// compacted.
-// - compress secondary weights into 8 bits.
+// - Compress secondary weights into 8 bits.
+// - Some LDML specs specify a context element. Currently we simply concatenate
+// those. Context can be implemented using the contraction trie. If Builder
+// could analyze and detect when using a context makes sense, there is no
+// need to expose this construct in the API.
// entry is used to keep track of a single entry in the collation element table
// during building. Examples of entries can be found in the Default Unicode
return e.contractionHandle.n != 0
}
-// A Builder builds collation tables. It can generate both the root table and
-// locale-specific tables defined as tailorings to the root table.
-// The typical use case is to specify the data for the root table and all locale-specific
-// tables using Add and AddTailoring before making any call to Build. This allows
-// Builder to ensure that a root table can support tailorings for each locale.
+// A Builder builds a root collation table. The user must specify the
+// collation elements for each entry. A common use will be to base the weights
+// on those specified in the allkeys* file as provided by the UCA or CLDR.
type Builder struct {
index *trieBuilder
+ locale []*Tailoring
entryMap map[string]*entry
entry []*entry
t *table
err error
built bool
+
+ minNonVar int // lowest primary recorded for a non-variable
+ varTop int // highest primary recorded for a variable
+}
+
+// A Tailoring builds a collation table based on another collation table.
+// The table is defined by specifying tailorings to the underlying table.
+// See http://unicode.org/reports/tr35/ for an overview of tailoring
+// collation tables. The CLDR contains pre-defined tailorings for a variety
+// of languages (See http://www.unicode.org/Public/cldr/2.0.1/core.zip.)
+type Tailoring struct {
+ id string
+ // TODO: implement.
}
// NewBuilder returns a new Builder.
return b
}
-// Add adds an entry for the root collation element table, mapping
+// Tailoring returns a Tailoring for the given locale. One should
+// have completed all calls to Add before calling Tailoring.
+func (b *Builder) Tailoring(locale string) *Tailoring {
+ t := &Tailoring{
+ id: locale,
+ }
+ b.locale = append(b.locale, t)
+ return t
+}
+
+// Add adds an entry to the collation element table, mapping
// a slice of runes to a sequence of collation elements.
// A collation element is specified as list of weights: []int{primary, secondary, ...}.
// The entries are typically obtained from a collation element table
// as defined in http://www.unicode.org/reports/tr10/#Data_Table_Format.
// Note that the collation elements specified by colelems are only used
// as a guide. The actual weights generated by Builder may differ.
-func (b *Builder) Add(str []rune, colelems [][]int) error {
+// The argument variables is a list of indices into colelems that should contain
+// a value for each colelem that is a variable. (See the reference above.)
+func (b *Builder) Add(str []rune, colelems [][]int, variables []int) error {
e := &entry{
runes: make([]rune, len(str)),
elems: make([][]int, len(colelems)),
e.elems[i] = append(e.elems[i], ce[0])
}
}
+ for i, ce := range e.elems {
+ isvar := false
+ for _, j := range variables {
+ if i == j {
+ isvar = true
+ }
+ }
+ if isvar {
+ if ce[0] >= b.minNonVar && b.minNonVar > 0 {
+ return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", ce[0], b.minNonVar)
+ }
+ if ce[0] > b.varTop {
+ b.varTop = ce[0]
+ }
+ } else if ce[0] > 0 {
+ if ce[0] <= b.varTop {
+ return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", ce[0], b.varTop)
+ }
+ if b.minNonVar == 0 || ce[0] < b.minNonVar {
+ b.minNonVar = ce[0]
+ }
+ }
+ }
elems, err := convertLargeWeights(e.elems)
if err != nil {
return err
return nil
}
-// AddTailoring defines a tailoring x <_level y for the given locale.
-// For example, AddTailoring("se", "z", "ä", Primary) sorts "ä" after "z"
-// at the primary level for Swedish. AddTailoring("de", "ue", "ü", Secondary)
-// sorts "ü" after "ue" at the secondary level for German.
+// SetAnchor sets the point after which elements passed in subsequent calls to
+// Insert will be inserted. It is equivalent to the reset directive in an LDML
+// specification. See Insert for an example.
+// SetAnchor supports the following logical reset positions:
+// <first_tertiary_ignorable/>, <last_tertiary_ignorable/>, <first_primary_ignorable/>,
+// and <last_non_ignorable/>.
+func (t *Tailoring) SetAnchor(anchor string) error {
+ // TODO: implement.
+ return nil
+}
+
+// SetAnchorBefore is similar to SetAnchor, except that subsequent calls to
+// Insert will insert entries before the anchor.
+func (t *Tailoring) SetAnchorBefore(anchor string) error {
+ // TODO: implement.
+ return nil
+}
+
+// Insert sets the ordering of str relative to the entry set by the previous
+// call to SetAnchor or Insert. The argument extend corresponds
+// to the extend elements as defined in LDML. A non-empty value for extend
+// will cause the collation elements corresponding to extend to be appended
+// to the collation elements generated for the entry added by Insert.
+// This has the same net effect as sorting str after the string anchor+extend.
// See http://www.unicode.org/reports/tr10/#Tailoring_Example for details
-// on parametric tailoring.
-func (b *Builder) AddTailoring(locale, x, y string, l collate.Level) error {
+// on parametric tailoring and http://unicode.org/reports/tr35/#Collation_Elements
+// for full details on LDML.
+//
+// Examples: create a tailoring for Swedish, where "ä" is ordered after "z"
+// at the primary sorting level:
+// t := b.Tailoring("se")
+// t.SetAnchor("z")
+// t.Insert(collate.Primary, "ä", "")
+// Order "ü" after "ue" at the secondary sorting level:
+// t.SetAnchor("ue")
+// t.Insert(collate.Secondary, "ü", "")
+// or
+// t.SetAnchor("u")
+// t.Insert(collate.Secondary, "ü", "e")
+// Order "q" after "ab" at the secondary level and "Q" after "q"
+// at the tertiary level:
+// t.SetAnchor("ab")
+// t.Insert(collate.Secondary, "q", "")
+// t.Insert(collate.Tertiary, "Q", "")
+// Order "b" before "a":
+// t.SetAnchorBefore("a")
+// t.Insert(collate.Primary, "b", "")
+// Order "0" after the last primary ignorable:
+// t.SetAnchor("<last_primary_ignorable/>")
+// t.Insert(collate.Primary, "0", "")
+func (t *Tailoring) Insert(level collate.Level, str, extend string) error {
// TODO: implement.
return nil
}
func (b *Builder) build() (*table, error) {
if !b.built {
b.built = true
- b.t = &table{}
+ b.t = &table{
+ maxContractLen: utf8.UTFMax,
+ variableTop: uint32(b.varTop),
+ }
b.simplify()
b.processExpansions() // requires simplify
return b.t, nil
}
-// Build builds a Collator for the given locale. To build the root table, set locale to "".
-func (b *Builder) Build(locale string) (*collate.Collator, error) {
+// Build builds the root Collator.
+func (b *Builder) Build() (*collate.Collator, error) {
t, err := b.build()
if err != nil {
return nil, err
}
- // TODO: support multiple locales
return collate.Init(t), nil
}
-// Print prints all tables to a Go file that can be included in
-// the Collate package.
+// Build builds a Collator for Tailoring t.
+func (t *Tailoring) Build() (*collate.Collator, error) {
+ // TODO: implement.
+ return nil, nil
+}
+
+// Print prints the tables for b and all its Tailorings as a Go file
+// that can be included in the Collate package.
func (b *Builder) Print(w io.Writer) (int, error) {
t, err := b.build()
if err != nil {
false,
"data files have been copied to the current directory; for debugging only")
-func failonerror(e error) {
+func failOnError(e error) {
if e != nil {
log.Fatal(e)
}
// parseUCA parses a Default Unicode Collation Element Table of the format
// specified in http://www.unicode.org/reports/tr10/#File_Format.
-// It returns the variable top.
+// The indices of the variable elements of each entry are passed to builder.Add.
-func parseUCA(builder *build.Builder) int {
- maxVar, minNonVar := 0, 1<<30
+func parseUCA(builder *build.Builder) {
r, err := openReader(*ducet)
- failonerror(err)
+ failOnError(err)
defer r.Close()
input := bufio.NewReader(r)
colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
lhs = append(lhs, rune(convHex(i, v)))
}
var n int
+ var vars []int
rhs := [][]int{}
- for _, m := range colelem.FindAllStringSubmatch(part[1], -1) {
+ // Use j for the element index so that i, which is passed to convHex as
+ // its line argument, is not shadowed inside this loop.
+ for j, m := range colelem.FindAllStringSubmatch(part[1], -1) {
n += len(m[0])
elem := []int{}
for _, h := range strings.Split(m[2], ".") {
elem = append(elem, convHex(i, h))
}
- if p := elem[0]; m[1] == "*" {
- if p > maxVar {
- maxVar = p
- }
- } else if p > 0 && p < minNonVar {
- minNonVar = p
+ // Record the index of each element whose marker is '*' so that it
+ // can be passed to Add as a variable.
+ if m[1] == "*" {
+ vars = append(vars, j)
}
rhs = append(rhs, elem)
}
if len(part[1]) < n+3 || part[1][n+1] != '#' {
log.Fatalf("%d: expected comment; found %s", i, part[1][n:])
}
- builder.Add(lhs, rhs)
+ failOnError(builder.Add(lhs, rhs, vars))
}
}
- if maxVar >= minNonVar {
- log.Fatalf("found maxVar > minNonVar (%d > %d)", maxVar, minNonVar)
- }
- return maxVar
}
func convHex(line int, s string) int {
}
// TODO: move this functionality to exp/locale/collate/build.
-func printCollators(c *collate.Collator, vartop int) {
+func printCollators(c *collate.Collator) {
const name = "Root"
fmt.Printf("var _%s = Collator{\n", name)
fmt.Printf("\tStrength: %v,\n", c.Strength)
- fmt.Printf("\tvariableTop: 0x%X,\n", vartop)
fmt.Printf("\tf: norm.NFD,\n")
fmt.Printf("\tt: &%sTable,\n", strings.ToLower(name))
fmt.Printf("}\n\n")
func main() {
flag.Parse()
b := build.NewBuilder()
- vartop := parseUCA(b)
- _, err := b.Build("")
- failonerror(err)
+ parseUCA(b)
+ c, err := b.Build()
+ failOnError(err)
fmt.Println("// Generated by running")
fmt.Printf("// maketables --ducet=%s\n", *ducet)
fmt.Println(`import "exp/norm"`)
fmt.Println("")
- c := &collate.Collator{}
- c.Strength = collate.Quaternary
- printCollators(c, vartop)
+ printCollators(c)
_, err = b.Print(os.Stdout)
- failonerror(err)
+ failOnError(err)
}