From: Marcel van Lohuizen Date: Thu, 6 Sep 2012 04:16:02 +0000 (+0900) Subject: exp/locale/collate/build: moved some of the code to the appropriate file, as X-Git-Tag: go1.1rc2~2521 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=f0a31b5fc2e64ad1c597a5efc72749ab77058b87;p=gostls13.git exp/locale/collate/build: moved some of the code to the appropriate file, as promised in CL 13985. R=r CC=golang-dev https://golang.org/cl/6503071 --- diff --git a/src/pkg/exp/locale/collate/build/builder.go b/src/pkg/exp/locale/collate/build/builder.go index 3f3829190c..6372a8ef89 100644 --- a/src/pkg/exp/locale/collate/build/builder.go +++ b/src/pkg/exp/locale/collate/build/builder.go @@ -29,49 +29,6 @@ import ( // could analyze and detect when using a context makes sense, there is no // need to expose this construct in the API. -// entry is used to keep track of a single entry in the collation element table -// during building. Examples of entries can be found in the Default Unicode -// Collation Element Table. -// See http://www.unicode.org/Public/UCA/6.0.0/allkeys.txt. -type entry struct { - runes []rune - elems [][]int // the collation elements for runes - str string // same as string(runes) - - // prev, next, and level are used to keep track of tailorings. - prev, next *entry - level collate.Level // next differs at this level - - decompose bool // can use NFKD decomposition to generate elems - exclude bool // do not include in table - logical logicalAnchor - - expansionIndex int // used to store index into expansion table - contractionHandle ctHandle - contractionIndex int // index into contraction elements -} - -func (e *entry) String() string { - return fmt.Sprintf("%X -> %X (ch:%x; ci:%d, ei:%d)", - e.runes, e.elems, e.contractionHandle, e.contractionIndex, e.expansionIndex) -} - -func (e *entry) skip() bool { - return e.contraction() -} - -func (e *entry) expansion() bool { - return !e.decompose && len(e.elems) > 1 -} - -func (e *entry) contraction() bool { - return len(e.runes) > 1 -} - -func (e *entry) contractionStarter() bool { - return e.contractionHandle.n != 0 -} - // A Builder builds a root collation table. The user must specify the // collation elements for each entry. A common use will be to base the weights // on those specified in the allkeys* file as provided by the UCA or CLDR. @@ -231,52 +188,6 @@ func (t *Tailoring) Insert(level collate.Level, str, extend string) error { return nil } -func (b *Builder) baseColElem(e *entry) uint32 { - ce := uint32(0) - var err error - switch { - case e.expansion(): - ce, err = makeExpandIndex(e.expansionIndex) - default: - if e.decompose { - log.Fatal("decompose should be handled elsewhere") - } - ce, err = makeCE(e.elems[0]) - } - if err != nil { - b.error(fmt.Errorf("%s: %X -> %X", err, e.runes, e.elems)) - } - return ce -} - -func (b *Builder) colElem(e *entry) uint32 { - if e.skip() { - log.Fatal("cannot build colElem for entry that should be skipped") - } - ce := uint32(0) - var err error - switch { - case e.decompose: - t1 := e.elems[0][2] - t2 := 0 - if len(e.elems) > 1 { - t2 = e.elems[1][2] - } - ce, err = makeDecompose(t1, t2) - case e.contractionStarter(): - ce, err = makeContractIndex(e.contractionHandle, e.contractionIndex) - default: - if len(e.runes) > 1 { - log.Fatal("colElem: contractions are handled in contraction trie") - } - ce = b.baseColElem(e) - } - if err != nil { - b.error(err) - } - return ce -} - func (b *Builder) error(e error) { if e != nil { b.err = e @@ -352,30 +263,6 @@ func reproducibleFromNFKD(e *entry, exp, nfkd [][]int) bool { return true } -func equalCE(a, b []int) bool { - if len(a) != len(b) { - return false - } - for i := 0; i < 3; i++ { - if b[i] != a[i] { - return false - } - } - return true -} - -func equalCEArrays(a, b [][]int) bool { - if len(a) != len(b) { - return false - } - for i := range a { - if !equalCE(a[i], b[i]) { - return false - } - } - return true -} - func (b *Builder) simplify() { // Runes that are a starter of a contraction should not be removed. // (To date, there is only Kannada character 0CCA.) @@ -412,62 +299,6 @@ func (b *Builder) simplify() { } } -// convertLargeWeights converts collation elements with large -// primaries (either double primaries or for illegal runes) -// to our own representation. -// A CJK character C is represented in the DUCET as -// [.FBxx.0020.0002.C][.BBBB.0000.0000.C] -// We will rewrite these characters to a single CE. -// We assume the CJK values start at 0x8000. -// See http://unicode.org/reports/tr10/#Implicit_Weights -func convertLargeWeights(elems [][]int) (res [][]int, err error) { - const ( - cjkPrimaryStart = 0xFB40 - rarePrimaryStart = 0xFB80 - otherPrimaryStart = 0xFBC0 - illegalPrimary = 0xFFFE - highBitsMask = 0x3F - lowBitsMask = 0x7FFF - lowBitsFlag = 0x8000 - shiftBits = 15 - ) - for i := 0; i < len(elems); i++ { - ce := elems[i] - p := ce[0] - if p < cjkPrimaryStart { - continue - } - if p > 0xFFFF { - return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p) - } - if p >= illegalPrimary { - ce[0] = illegalOffset + p - illegalPrimary - } else { - if i+1 >= len(elems) { - return elems, fmt.Errorf("second part of double primary weight missing: %v", elems) - } - if elems[i+1][0]&lowBitsFlag == 0 { - return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems) - } - np := ((p & highBitsMask) << shiftBits) + elems[i+1][0]&lowBitsMask - switch { - case p < rarePrimaryStart: - np += commonUnifiedOffset - case p < otherPrimaryStart: - np += rareUnifiedOffset - default: - p += otherOffset - } - ce[0] = np - for j := i + 1; j+1 < len(elems); j++ { - elems[j] = elems[j+1] - } - elems = elems[:len(elems)-1] - } - } - return elems, nil -} - // appendExpansion converts the given collation sequence to // collation elements and adds them to the expansion table. // It returns an index to the expansion table. @@ -586,7 +417,9 @@ func (b *Builder) processContractions() { es[0].contractionHandle = handle // Add collation elements for contractions. for _, e := range es { - t.contractElem = append(t.contractElem, b.baseColElem(e)) + ce, err := e.encodeBase() + b.error(err) + t.contractElem = append(t.contractElem, ce) } } } @@ -596,7 +429,8 @@ func (b *Builder) buildTrie() { o := b.root for e := o.front(); e != nil; e, _ = e.nextIndexed() { if !e.skip() { - ce := b.colElem(e) + ce, err := e.encode() + b.error(err) t.insert(e.runes[0], ce) } } diff --git a/src/pkg/exp/locale/collate/build/colelem.go b/src/pkg/exp/locale/collate/build/colelem.go index 3f0f0c6c28..343aa740a7 100644 --- a/src/pkg/exp/locale/collate/build/colelem.go +++ b/src/pkg/exp/locale/collate/build/colelem.go @@ -199,6 +199,62 @@ func implicitPrimary(r rune) int { return int(r) + otherOffset } +// convertLargeWeights converts collation elements with large +// primaries (either double primaries or for illegal runes) +// to our own representation. +// A CJK character C is represented in the DUCET as +// [.FBxx.0020.0002.C][.BBBB.0000.0000.C] +// We will rewrite these characters to a single CE. +// We assume the CJK values start at 0x8000. +// See http://unicode.org/reports/tr10/#Implicit_Weights +func convertLargeWeights(elems [][]int) (res [][]int, err error) { + const ( + cjkPrimaryStart = 0xFB40 + rarePrimaryStart = 0xFB80 + otherPrimaryStart = 0xFBC0 + illegalPrimary = 0xFFFE + highBitsMask = 0x3F + lowBitsMask = 0x7FFF + lowBitsFlag = 0x8000 + shiftBits = 15 + ) + for i := 0; i < len(elems); i++ { + ce := elems[i] + p := ce[0] + if p < cjkPrimaryStart { + continue + } + if p > 0xFFFF { + return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p) + } + if p >= illegalPrimary { + ce[0] = illegalOffset + p - illegalPrimary + } else { + if i+1 >= len(elems) { + return elems, fmt.Errorf("second part of double primary weight missing: %v", elems) + } + if elems[i+1][0]&lowBitsFlag == 0 { + return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems) + } + np := ((p & highBitsMask) << shiftBits) + elems[i+1][0]&lowBitsMask + switch { + case p < rarePrimaryStart: + np += commonUnifiedOffset + case p < otherPrimaryStart: + np += rareUnifiedOffset + default: + p += otherOffset + } + ce[0] = np + for j := i + 1; j+1 < len(elems); j++ { + elems[j] = elems[j+1] + } + elems = elems[:len(elems)-1] + } + } + return elems, nil +} + // nextWeight computes the first possible collation weights following elems // for the given level. func nextWeight(level collate.Level, elems [][]int) [][]int { @@ -247,3 +303,27 @@ func compareWeights(a, b [][]int) (result int, level collate.Level) { } return 0, collate.Identity } + +func equalCE(a, b []int) bool { + if len(a) != len(b) { + return false + } + for i := 0; i < 3; i++ { + if b[i] != a[i] { + return false + } + } + return true +} + +func equalCEArrays(a, b [][]int) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if !equalCE(a[i], b[i]) { + return false + } + } + return true +} diff --git a/src/pkg/exp/locale/collate/build/order.go b/src/pkg/exp/locale/collate/build/order.go index 014141c6d6..f09881608d 100644 --- a/src/pkg/exp/locale/collate/build/order.go +++ b/src/pkg/exp/locale/collate/build/order.go @@ -21,7 +21,48 @@ const ( lastAnchor = 1 ) -// TODO: move type entry from builder.go to this file. +// entry is used to keep track of a single entry in the collation element table +// during building. Examples of entries can be found in the Default Unicode +// Collation Element Table. +// See http://www.unicode.org/Public/UCA/6.0.0/allkeys.txt. +type entry struct { + runes []rune + elems [][]int // the collation elements for runes + str string // same as string(runes) + + // prev, next, and level are used to keep track of tailorings. + prev, next *entry + level collate.Level // next differs at this level + + decompose bool // can use NFKD decomposition to generate elems + exclude bool // do not include in table + logical logicalAnchor + + expansionIndex int // used to store index into expansion table + contractionHandle ctHandle + contractionIndex int // index into contraction elements +} + +func (e *entry) String() string { + return fmt.Sprintf("%X -> %X (ch:%x; ci:%d, ei:%d)", + e.runes, e.elems, e.contractionHandle, e.contractionIndex, e.expansionIndex) +} + +func (e *entry) skip() bool { + return e.contraction() +} + +func (e *entry) expansion() bool { + return !e.decompose && len(e.elems) > 1 +} + +func (e *entry) contraction() bool { + return len(e.runes) > 1 +} + +func (e *entry) contractionStarter() bool { + return e.contractionHandle.n != 0 +} // nextIndexed gets the next entry that needs to be stored in the table. // It returns the entry and the collation level at which the next entry differs @@ -72,6 +113,42 @@ func (e *entry) insertAfter(n *entry) { e.next = n } +func (e *entry) encodeBase() (ce uint32, err error) { + switch { + case e.expansion(): + ce, err = makeExpandIndex(e.expansionIndex) + default: + if e.decompose { + log.Fatal("decompose should be handled elsewhere") + } + ce, err = makeCE(e.elems[0]) + } + return +} + +func (e *entry) encode() (ce uint32, err error) { + if e.skip() { + log.Fatal("cannot build colElem for entry that should be skipped") + } + switch { + case e.decompose: + t1 := e.elems[0][2] + t2 := 0 + if len(e.elems) > 1 { + t2 = e.elems[1][2] + } + ce, err = makeDecompose(t1, t2) + case e.contractionStarter(): + ce, err = makeContractIndex(e.contractionHandle, e.contractionIndex) + default: + if len(e.runes) > 1 { + log.Fatal("colElem: contractions are handled in contraction trie") + } + ce, err = e.encodeBase() + } + return +} + // entryLess returns true if a sorts before b and false otherwise. func entryLess(a, b *entry) bool { if res, _ := compareWeights(a.elems, b.elems); res != 0 {