From fdce27f7b8f671e3399d2c116dbd15ec2e612af2 Mon Sep 17 00:00:00 2001 From: Marcel van Lohuizen Date: Wed, 25 Apr 2012 13:19:35 +0200 Subject: [PATCH] exp/locale/collate: Added Builder type for generating a complete collation table. At this moment, it only implements the generation of a root table. R=r CC=golang-dev https://golang.org/cl/6039047 --- src/pkg/exp/locale/collate/build/builder.go | 501 ++++++++++++++++++ .../exp/locale/collate/build/builder_test.go | 265 +++++++++ 2 files changed, 766 insertions(+) create mode 100644 src/pkg/exp/locale/collate/build/builder.go create mode 100644 src/pkg/exp/locale/collate/build/builder_test.go diff --git a/src/pkg/exp/locale/collate/build/builder.go b/src/pkg/exp/locale/collate/build/builder.go new file mode 100644 index 0000000000..fa9661cf32 --- /dev/null +++ b/src/pkg/exp/locale/collate/build/builder.go @@ -0,0 +1,501 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package build + +import ( + "exp/locale/collate" + "exp/norm" + "fmt" + "io" + "log" + "sort" + "strings" +) + +// TODO: optimizations: +// - expandElem is currently 20K. By putting unique colElems in a separate +// table and having a byte array of indexes into this table, we can reduce +// the total size to about 7K. By also factoring out the length bytes, we +// can reduce this to about 6K. +// - trie valueBlocks are currently 100K. There are a lot of sparse blocks +// and many consecutive values with the same stride. This can be further +// compacted. + +// entry is used to keep track of a single entry in the collation element table +// during building. Examples of entries can be found in the Default Unicode +// Collation Element Table. +// See http://www.unicode.org/Public/UCA/6.0.0/allkeys.txt. +type entry struct { + runes []rune + elems [][]int // the collation elements for runes + str string // same as string(runes) + + decompose bool // can use NFKD decomposition to generate elems + expansionIndex int // used to store index into expansion table + contractionHandle ctHandle + contractionIndex int // index into contraction elements +} + +func (e *entry) String() string { + return fmt.Sprintf("%X -> %X (ch:%x; ci:%d, ei:%d)", + e.runes, e.elems, e.contractionHandle, e.contractionIndex, e.expansionIndex) +} + +func (e *entry) skip() bool { + return e.contraction() +} + +func (e *entry) expansion() bool { + return !e.decompose && len(e.elems) > 1 +} + +func (e *entry) contraction() bool { + return len(e.runes) > 1 +} + +func (e *entry) contractionStarter() bool { + return e.contractionHandle.n != 0 +} + +// A Builder builds collation tables. It can generate both the root table and +// locale-specific tables defined as tailorings to the root table. +// The typical use case is to specify the data for the root table and all locale-specific +// tables using Add and AddTailoring before making any call to Build. This allows +// Builder to ensure that a root table can support tailorings for each locale. +type Builder struct { + entryMap map[string]*entry + entry []*entry + t *table + err error +} + +// NewBuilder returns a new Builder. +func NewBuilder() *Builder { + b := &Builder{ + entryMap: make(map[string]*entry), + } + return b +} + +// Add adds an entry for the root collation element table, mapping +// a slice of runes to a sequence of collation elements. +// A collation element is specified as list of weights: []int{primary, secondary, ...}. +// The entries are typically obtained from a collation element table +// as defined in http://www.unicode.org/reports/tr10/#Data_Table_Format. +// Note that the collation elements specified by colelems are only used +// as a guide. The actual weights generated by Builder may differ. +func (b *Builder) Add(str []rune, colelems [][]int) error { + e := &entry{ + runes: make([]rune, len(str)), + elems: make([][]int, len(colelems)), + str: string(str), + } + copy(e.runes, str) + for i, ce := range colelems { + e.elems[i] = append(e.elems[i], ce...) + if len(ce) == 0 { + e.elems[i] = append(e.elems[i], []int{0, 0, 0, 0}...) + break + } + if len(ce) == 1 { + e.elems[i] = append(e.elems[i], defaultSecondary) + } + if len(ce) <= 2 { + e.elems[i] = append(e.elems[i], defaultTertiary) + } + if len(ce) <= 3 { + e.elems[i] = append(e.elems[i], ce[0]) + } + } + b.entryMap[string(str)] = e + b.entry = append(b.entry, e) + return nil +} + +// AddTailoring defines a tailoring x <_level y for the given locale. +// For example, AddTailoring("se", "z", "ä", Primary) sorts "ä" after "z" +// at the primary level for Swedish. AddTailoring("de", "ue", "ü", Secondary) +// sorts "ü" after "ue" at the secondary level for German. +// See http://www.unicode.org/reports/tr10/#Tailoring_Example for details +// on parametric tailoring. +func (b *Builder) AddTailoring(locale, x, y string, l collate.Level) error { + // TODO: implement. + return nil +} + +func (b *Builder) baseColElem(e *entry) uint32 { + ce := uint32(0) + var err error + switch { + case e.expansion(): + ce, err = makeExpandIndex(e.expansionIndex) + default: + if e.decompose { + log.Fatal("decompose should be handled elsewhere") + } + ce, err = makeCE(e.elems[0]) + } + if err != nil { + b.error(fmt.Errorf("%s: %X -> %X", err, e.runes, e.elems)) + } + return ce +} + +func (b *Builder) colElem(e *entry) uint32 { + if e.skip() { + log.Fatal("cannot build colElem for entry that should be skipped") + } + ce := uint32(0) + var err error + switch { + case e.decompose: + t1 := e.elems[0][2] + t2 := 0 + if len(e.elems) > 1 { + t2 = e.elems[1][2] + } + ce, err = makeDecompose(t1, t2) + case e.contractionStarter(): + ce, err = makeContractIndex(e.contractionHandle, e.contractionIndex) + default: + if len(e.runes) > 1 { + log.Fatal("colElem: contractions are handled in contraction trie") + } + ce = b.baseColElem(e) + } + if err != nil { + b.error(err) + } + return ce +} + +func (b *Builder) error(e error) { + if e != nil { + b.err = e + } +} + +func (b *Builder) build() (*table, error) { + b.t = &table{} + + b.contractCJK() + b.simplify() // requires contractCJK + b.processExpansions() // requires simplify + b.processContractions() // requires simplify + b.buildTrie() // requires process* + + if b.err != nil { + return nil, b.err + } + return b.t, nil +} + +// Build builds a Collator for the given locale. To build the root table, set locale to "". +func (b *Builder) Build(locale string) (*collate.Collator, error) { + t, err := b.build() + if err != nil { + return nil, err + } + // TODO: support multiple locales + return collate.Init(t), nil +} + +// Print prints all tables to a Go file that can be included in +// the Collate package. +func (b *Builder) Print(w io.Writer) (int, error) { + t, err := b.build() + if err != nil { + return 0, err + } + // TODO: support multiple locales + n, _, err := t.print(w, "root") + return n, err +} + +// reproducibleFromNFKD checks whether the given expansion could be generated +// from an NFKD expansion. +func reproducibleFromNFKD(e *entry, exp, nfkd [][]int) bool { + // Length must be equal. + if len(exp) != len(nfkd) { + return false + } + for i, ce := range exp { + // Primary and secondary values should be equal. + if ce[0] != nfkd[i][0] || ce[1] != nfkd[i][1] { + return false + } + // Tertiary values should be equal to maxTertiary for third element onwards. + if i >= 2 && ce[2] != maxTertiary { + return false + } + } + return true +} + +func equalCE(a, b []int) bool { + if len(a) != len(b) { + return false + } + for i := 0; i < 3; i++ { + if b[i] != a[i] { + return false + } + } + return true +} + +func equalCEArrays(a, b [][]int) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if !equalCE(a[i], b[i]) { + return false + } + } + return true +} + +// genColElems generates a collation element array from the runes in str. This +// assumes that all collation elements have already been added to the Builder. +func (b *Builder) genColElems(str string) [][]int { + elems := [][]int{} + for _, r := range []rune(str) { + if ee, ok := b.entryMap[string(r)]; !ok { + elem := []int{implicitPrimary(r), defaultSecondary, defaultTertiary, int(r)} + elems = append(elems, elem) + } else { + elems = append(elems, ee.elems...) + } + } + return elems +} + +func (b *Builder) simplify() { + // Runes that are a starter of a contraction should not be removed. + // (To date, there is only Kannada character 0CCA.) + keep := make(map[rune]bool) + for _, e := range b.entry { + if len(e.runes) > 1 { + keep[e.runes[0]] = true + } + } + // Remove entries for which the runes normalize (using NFD) to identical values. + for _, e := range b.entry { + s := e.str + nfd := norm.NFD.String(s) + if len(e.runes) > 1 || keep[e.runes[0]] || nfd == s { + continue + } + if equalCEArrays(b.genColElems(nfd), e.elems) { + delete(b.entryMap, s) + } + } + // Remove entries in b.entry that were removed from b.entryMap + k := 0 + for _, e := range b.entry { + if _, ok := b.entryMap[e.str]; ok { + b.entry[k] = e + k++ + } + } + b.entry = b.entry[:k] + // Tag entries for which the runes NFKD decompose to identical values. + for _, e := range b.entry { + s := e.str + nfkd := norm.NFKD.String(s) + if len(e.runes) > 1 || keep[e.runes[0]] || nfkd == s { + continue + } + if reproducibleFromNFKD(e, e.elems, b.genColElems(nfkd)) { + e.decompose = true + } + } +} + +// convertLargeWeights converts collation elements with large +// primaries (either double primaries or for illegal runes) +// to our own representation. +// See http://unicode.org/reports/tr10/#Implicit_Weights +func convertLargeWeights(elems [][]int) (res [][]int, err error) { + const ( + firstLargePrimary = 0xFB40 + illegalPrimary = 0xFFFE + highBitsMask = 0x3F + lowBitsMask = 0x7FFF + lowBitsFlag = 0x8000 + shiftBits = 15 + ) + for i := 0; i < len(elems); i++ { + ce := elems[i] + p := ce[0] + if p < firstLargePrimary { + continue + } + if p >= illegalPrimary { + ce[0] = illegalOffset + p - illegalPrimary + } else { + if i+1 >= len(elems) { + return elems, fmt.Errorf("second part of double primary weight missing: %v", elems) + } + if elems[i+1][0]&lowBitsFlag == 0 { + return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems) + } + r := rune(((p & highBitsMask) << shiftBits) + elems[i+1][0]&lowBitsMask) + ce[0] = implicitPrimary(r) + for j := i + 1; j+1 < len(elems); j++ { + elems[j] = elems[j+1] + } + elems = elems[:len(elems)-1] + } + } + return elems, nil +} + +// A CJK character C is represented in the DUCET as +// [.FBxx.0020.0002.C][.BBBB.0000.0000.C] +// We will rewrite these characters to a single CE. +// We assume the CJK values start at 0x8000. +func (b *Builder) contractCJK() { + for _, e := range b.entry { + elms, err := convertLargeWeights(e.elems) + e.elems = elms + if err != nil { + err = fmt.Errorf("%U: %s", e.runes, err) + } + b.error(err) + } +} + +// appendExpansion converts the given collation sequence to +// collation elements and adds them to the expansion table. +// It returns an index to the expansion table. +func (b *Builder) appendExpansion(e *entry) int { + t := b.t + i := len(t.expandElem) + ce := uint32(len(e.elems)) + t.expandElem = append(t.expandElem, ce) + for _, w := range e.elems { + ce, err := makeCE(w) + if err != nil { + b.error(err) + return -1 + } + t.expandElem = append(t.expandElem, ce) + } + return i +} + +// processExpansions extracts data necessary to generate +// the extraction tables. +func (b *Builder) processExpansions() { + eidx := make(map[string]int) + for _, e := range b.entry { + if !e.expansion() { + continue + } + key := fmt.Sprintf("%v", e.elems) + i, ok := eidx[key] + if !ok { + i = b.appendExpansion(e) + eidx[key] = i + } + e.expansionIndex = i + } +} + +func (b *Builder) processContractions() { + // Collate contractions per starter rune. + starters := []rune{} + cm := make(map[rune][]*entry) + for _, e := range b.entry { + if e.contraction() { + r := e.runes[0] + if _, ok := cm[r]; !ok { + starters = append(starters, r) + } + cm[r] = append(cm[r], e) + } + } + // Add entries of single runes that are at a start of a contraction. + for _, e := range b.entry { + if !e.contraction() { + r := e.runes[0] + if _, ok := cm[r]; ok { + cm[r] = append(cm[r], e) + } + } + } + // Build the tries for the contractions. + t := b.t + handlemap := make(map[string]ctHandle) + for _, r := range starters { + l := cm[r] + // Compute suffix strings. There are 31 different contraction suffix + // sets for 715 contractions and 82 contraction starter runes as of + // version 6.0.0. + sufx := []string{} + hasSingle := false + for _, e := range l { + if len(e.runes) > 1 { + sufx = append(sufx, string(e.runes[1:])) + } else { + hasSingle = true + } + } + if !hasSingle { + b.error(fmt.Errorf("no single entry for starter rune %U found", r)) + continue + } + // Unique the suffix set. + sort.Strings(sufx) + key := strings.Join(sufx, "\n") + handle, ok := handlemap[key] + if !ok { + var err error + handle, err = t.contractTries.appendTrie(sufx) + if err != nil { + b.error(err) + } + handlemap[key] = handle + } + // Bucket sort entries in index order. + es := make([]*entry, len(l)) + for _, e := range l { + var o, sn int + if len(e.runes) > 1 { + str := []byte(string(e.runes[1:])) + o, sn = t.contractTries.lookup(handle, str) + if sn != len(str) { + log.Fatalf("processContractions: unexpected length for '%X'; len=%d; want %d", []rune(string(str)), sn, len(str)) + } + } + if es[o] != nil { + log.Fatalf("Multiple contractions for position %d for rune %U", o, e.runes[0]) + } + es[o] = e + } + // Store info in entry for starter rune. + es[0].contractionIndex = len(t.contractElem) + es[0].contractionHandle = handle + // Add collation elements for contractions. + for _, e := range es { + t.contractElem = append(t.contractElem, b.baseColElem(e)) + } + } +} + +func (b *Builder) buildTrie() { + t := newNode() + for _, e := range b.entry { + if !e.skip() { + ce := b.colElem(e) + t.insert(e.runes[0], ce) + } + } + i, err := t.generate() + b.t.index = *i + b.error(err) +} diff --git a/src/pkg/exp/locale/collate/build/builder_test.go b/src/pkg/exp/locale/collate/build/builder_test.go new file mode 100644 index 0000000000..343c7afbfd --- /dev/null +++ b/src/pkg/exp/locale/collate/build/builder_test.go @@ -0,0 +1,265 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package build + +import "testing" + +// cjk returns an implicit collation element for a CJK rune. +func cjk(r rune) [][]int { + // A CJK character C is represented in the DUCET as + // [.AAAA.0020.0002.C][.BBBB.0000.0000.C] + // Where AAAA is the most significant 15 bits plus a base value. + // Any base value will work for the test, so we pick the common value of FB40. + const base = 0xFB40 + return [][]int{ + []int{base + int(r>>15), defaultSecondary, defaultTertiary, int(r)}, + []int{int(r&0x7FFF) | 0x8000, 0, 0, int(r)}, + } +} + +func pCE(p int) [][]int { + return [][]int{[]int{p, defaultSecondary, defaultTertiary, 0}} +} + +func pqCE(p, q int) [][]int { + return [][]int{[]int{p, defaultSecondary, defaultTertiary, q}} +} + +func ptCE(p, t int) [][]int { + return [][]int{[]int{p, defaultSecondary, t, 0}} +} + +func sCE(s int) [][]int { + return [][]int{[]int{0, s, defaultTertiary, 0}} +} + +func stCE(s, t int) [][]int { + return [][]int{[]int{0, s, t, 0}} +} + +// ducetElem is used to define test data that is used to generate a table. +type ducetElem struct { + str string + ces [][]int +} + +func newBuilder(t *testing.T, ducet []ducetElem) *Builder { + b := NewBuilder() + for _, e := range ducet { + if err := b.Add([]rune(e.str), e.ces); err != nil { + t.Errorf(err.Error()) + } + } + b.t = &table{} + return b +} + +type convertTest struct { + in, out [][]int + err bool +} + +var convLargeTests = []convertTest{ + {pCE(0xFB39), pCE(0xFB39), false}, + {cjk(0x2F9B2), pqCE(0x7F4F2, 0x2F9B2), false}, + {pCE(0xFB40), pCE(0), true}, + {append(pCE(0xFB40), pCE(0)[0]), pCE(0), true}, + {pCE(0xFFFE), pCE(illegalOffset), false}, + {pCE(0xFFFF), pCE(illegalOffset + 1), false}, +} + +func TestConvertLarge(t *testing.T) { + for i, tt := range convLargeTests { + e := &entry{elems: tt.in} + elems, err := convertLargeWeights(e.elems) + if tt.err { + if err == nil { + t.Errorf("%d: expected error; none found", i) + } + continue + } else if err != nil { + t.Errorf("%d: unexpected error: %v", i, err) + } + if !equalCEArrays(elems, tt.out) { + t.Errorf("%d: conversion was %x; want %x", i, elems, tt.out) + } + } +} + +// Collation element table for simplify tests. +var simplifyTest = []ducetElem{ + {"\u0300", sCE(30)}, // grave + {"\u030C", sCE(40)}, // caron + {"A", ptCE(100, 8)}, + {"D", ptCE(104, 8)}, + {"E", ptCE(105, 8)}, + {"I", ptCE(110, 8)}, + {"z", ptCE(130, 8)}, + {"\u05F2", append(ptCE(200, 4), ptCE(200, 4)[0])}, + {"\u05B7", sCE(80)}, + {"\u00C0", append(ptCE(100, 8), sCE(30)...)}, // A with grave, can be removed + {"\u00C8", append(ptCE(105, 8), sCE(30)...)}, // E with grave + {"\uFB1F", append(ptCE(200, 4), ptCE(200, 4)[0], sCE(80)[0])}, // eliminated by NFD + {"\u00C8\u0302", ptCE(106, 8)}, // block previous from simplifying + {"\u01C5", append(ptCE(104, 9), ptCE(130, 4)[0], stCE(40, maxTertiary)[0])}, // eliminated by NFKD + // no removal: tertiary value of third element is not maxTertiary + {"\u2162", append(ptCE(110, 9), ptCE(110, 4)[0], ptCE(110, 8)[0])}, +} + +var genColTests = []ducetElem{ + {"\uFA70", pqCE(0x1F5B0, 0xFA70)}, + {"A\u0300", append(ptCE(100, 8), sCE(30)...)}, + {"A\u0300\uFA70", append(ptCE(100, 8), sCE(30)[0], pqCE(0x1F5B0, 0xFA70)[0])}, + {"A\u0300A\u0300", append(ptCE(100, 8), sCE(30)[0], ptCE(100, 8)[0], sCE(30)[0])}, +} + +func TestGenColElems(t *testing.T) { + b := newBuilder(t, simplifyTest[:5]) + + for i, tt := range genColTests { + res := b.genColElems(tt.str) + if !equalCEArrays(tt.ces, res) { + t.Errorf("%d: result %X; want %X", i, res, tt.ces) + } + } +} + +type strArray []string + +func (sa strArray) contains(s string) bool { + for _, e := range sa { + if e == s { + return true + } + } + return false +} + +var simplifyRemoved = strArray{"\u00C0", "\uFB1F"} +var simplifyMarked = strArray{"\u01C5"} + +func TestSimplify(t *testing.T) { + b := newBuilder(t, simplifyTest) + b.simplify() + + k := 0 + for i, tt := range simplifyTest { + if simplifyRemoved.contains(tt.str) { + continue + } + e := b.entry[k] + k++ + if e.str != tt.str || !equalCEArrays(e.elems, tt.ces) { + t.Errorf("%d: found element %s -> %X; want %s -> %X", i, e.str, e.elems, tt.str, tt.ces) + break + } + } + k = 0 + for i, e := range b.entry { + gold := simplifyMarked.contains(e.str) + if gold { + k++ + } + if gold != e.decompose { + t.Errorf("%d: %s has decompose %v; want %v", i, e.str, e.decompose, gold) + } + } + if k != len(simplifyMarked) { + t.Errorf(" an entry that should be marked as decompose was deleted") + } +} + +var expandTest = []ducetElem{ + {"\u00C0", append(ptCE(100, 8), sCE(30)...)}, + {"\u00C8", append(ptCE(105, 8), sCE(30)...)}, + {"\u00C9", append(ptCE(105, 8), sCE(30)...)}, // identical expansion + {"\u05F2", append(ptCE(200, 4), ptCE(200, 4)[0], ptCE(200, 4)[0])}, +} + +func TestExpand(t *testing.T) { + const ( + totalExpansions = 3 + totalElements = 2 + 2 + 3 + totalExpansions + ) + b := newBuilder(t, expandTest) + b.processExpansions() + + for i, tt := range expandTest { + e := b.entry[i] + exp := b.t.expandElem[e.expansionIndex:] + if int(exp[0]) != len(tt.ces) { + t.Errorf("%U: len(expansion)==%d; want %d", []rune(tt.str)[0], exp[0], len(tt.ces)) + } + exp = exp[1:] + for j, w := range tt.ces { + if ce, _ := makeCE(w); exp[j] != ce { + t.Errorf("%U: element %d is %X; want %X", []rune(tt.str)[0], j, exp[j], ce) + } + } + } + // Verify uniquing. + if len(b.t.expandElem) != totalElements { + t.Errorf("len(expandElem)==%d; want %d", len(b.t.expandElem), totalElements) + } +} + +var contractTest = []ducetElem{ + {"abc", pCE(102)}, + {"abd", pCE(103)}, + {"a", pCE(100)}, + {"ab", pCE(101)}, + {"ac", pCE(104)}, + {"bcd", pCE(202)}, + {"b", pCE(200)}, + {"bc", pCE(201)}, + {"bd", pCE(203)}, + // shares suffixes with a* + {"Ab", pCE(301)}, + {"A", pCE(300)}, + {"Ac", pCE(304)}, + {"Abc", pCE(302)}, + {"Abd", pCE(303)}, + // starter to be ignored + {"z", pCE(1000)}, +} + +func TestContract(t *testing.T) { + const ( + totalElements = 5 + 5 + 4 + ) + b := newBuilder(t, contractTest) + b.processContractions() + + indexMap := make(map[int]bool) + handleMap := make(map[rune]*entry) + for _, e := range b.entry { + if e.contractionHandle.n > 0 { + handleMap[e.runes[0]] = e + indexMap[e.contractionHandle.index] = true + } + } + // Verify uniquing. + if len(indexMap) != 2 { + t.Errorf("number of tries is %d; want %d", len(indexMap), 2) + } + for _, tt := range contractTest { + e, ok := handleMap[[]rune(tt.str)[0]] + if !ok { + continue + } + str := tt.str[1:] + offset, n := b.t.contractTries.lookup(e.contractionHandle, []byte(str)) + if len(str) != n { + t.Errorf("%s: bytes consumed==%d; want %d", tt.str, n, len(str)) + } + ce := b.t.contractElem[offset+e.contractionIndex] + if want, _ := makeCE(tt.ces[0]); want != ce { + t.Errorf("%s: element %X; want %X", tt.str, ce, want) + } + } + if len(b.t.contractElem) != totalElements { + t.Errorf("len(expandElem)==%d; want %d", len(b.t.contractElem), totalElements) + } +} -- 2.48.1