--- /dev/null
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package build
+
+import (
+ "fmt"
+ "unicode"
+)
+
+const (
+ defaultSecondary = 0x20
+ defaultTertiary = 0x2
+ maxTertiary = 0x1F
+)
+
+// A collation element is represented as an uint32.
+// In the typical case, a rune maps to a single collation element. If a rune
+// can be the start of a contraction or expands into multiple collation elements,
+// then the collation element that is associated with a rune will have a special
+// form to represent such m to n mappings. Such special collation elements
+// have a value >= 0x80000000.
+
+// For normal collation elements, we assume that a collation element either has
+// a primary or non-default secondary value, not both.
+// Collation elements with a primary value are of the form
+// 010ppppp pppppppp pppppppp tttttttt, where
+// - p* is primary collation value
+// - t* is the tertiary collation value
+// Collation elements with a secondary value are of the form
+// 00000000 ssssssss ssssssss tttttttt, where
+// - s* is the secondary collation value
+// - t* is the tertiary collation value
+const (
+ maxPrimaryBits = 21
+ maxSecondaryBits = 16
+ maxTertiaryBits = 8
+
+ isPrimary = 0x40000000
+)
+
+func makeCE(weights []int) (uint32, error) {
+ if w := weights[0]; w >= 1<<maxPrimaryBits || w < 0 {
+ return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits)
+ }
+ if w := weights[1]; w >= 1<<maxSecondaryBits || w < 0 {
+ return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<<maxSecondaryBits)
+ }
+ if w := weights[2]; w >= 1<<maxTertiaryBits || w < 0 {
+ return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %d >= %d", w, 1<<maxTertiaryBits)
+ }
+ ce := uint32(0)
+ if weights[0] != 0 {
+ // primary weight form
+ if weights[1] != defaultSecondary {
+ return 0, fmt.Errorf("makeCE: non-default secondary weight for non-zero primary: %X", weights)
+ }
+ ce = uint32(weights[0]<<maxTertiaryBits + weights[2])
+ ce |= isPrimary
+ } else {
+ // secondary weight form
+ ce = uint32(weights[1]<<maxTertiaryBits + weights[2])
+ }
+ return ce, nil
+}
+
+// For contractions, collation elements are of the form
+// 10bbbbbb bbbbbbbb iiiiiiii iiinnnnn, where
+// - n* is the size of the first node in the contraction trie.
+// - i* is the index of the first node in the contraction trie.
+// - b* is the offset into the contraction collation element table.
+// See contract.go for details on the contraction trie.
+const (
+ contractID = 0x80000000
+ maxNBits = 5
+ maxTrieIndexBits = 11
+ maxContractOffsetBits = 14
+)
+
+func makeContractIndex(h ctHandle, offset int) (uint32, error) {
+ if h.n >= 1<<maxNBits {
+ return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits)
+ }
+ if h.index >= 1<<maxTrieIndexBits {
+ return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
+ }
+ if offset >= 1<<maxContractOffsetBits {
+ return 0, fmt.Errorf("offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
+ }
+ ce := uint32(contractID)
+ ce += uint32(offset << (maxTrieIndexBits + maxNBits))
+ ce += uint32(h.index << maxNBits)
+ ce += uint32(h.n)
+ return ce, nil
+}
+
+// For expansions, collation elements are of the form
+// 110bbbbb bbbbbbbb bbbbbbbb bbbbbbbb,
+// where b* is the index into the expansion sequence table.
+const (
+ expandID = 0xC0000000
+ maxExpandIndexBits = 29
+)
+
+func makeExpandIndex(index int) (uint32, error) {
+ if index >= 1<<maxExpandIndexBits {
+ return 0, fmt.Errorf("index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
+ }
+ return expandID + uint32(index), nil
+}
+
+// Each list of collation elements corresponding to an expansion starts with
+// a header indicating the length of the sequence.
+func makeExpansionHeader(n int) (uint32, error) {
+ return uint32(n), nil
+}
+
+// Some runes can be expanded using NFKD decomposition. Instead of storing the full
+// sequence of collation elements, we decompose the rune and lookup the collation
+// elements for each rune in the decomposition and modify the tertiary weights.
+// The collation element, in this case, is of the form
+// 11100000 00000000 wwwwwwww vvvvvvvv, where
+// - v* is the replacement tertiary weight for the first rune,
+// - w* is the replacement tertiary weight for the second rune,
+// Tertiary weights of subsequent runes should be replaced with maxTertiary.
+// See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
+const (
+ decompID = 0xE0000000
+)
+
+func makeDecompose(t1, t2 int) (uint32, error) {
+ if t1 >= 256 || t1 < 0 {
+ return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1)
+ }
+ if t2 >= 256 || t2 < 0 {
+ return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2)
+ }
+ return uint32(t2<<8+t1) + decompID, nil
+}
+
+const (
+ // These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
+ minUnified rune = 0x4E00
+ maxUnified = 0x9FFF
+ minCompatibility = 0xF900
+ maxCompatibility = 0xFAFF
+ minRare = 0x3400
+ maxRare = 0x4DBF
+)
+const (
+ commonUnifiedOffset = 0xFB40
+ rareUnifiedOffset = 0x1FB40
+ otherOffset = 0x4FB40
+ illegalOffset = otherOffset + unicode.MaxRune
+ maxPrimary = illegalOffset + 2 // there are 2 illegal values.
+)
+
+// implicitPrimary returns the primary weight for the a rune
+// for which there is no entry for the rune in the collation table.
+// We take a different approach from the one specified in
+// http://unicode.org/reports/tr10/#Implicit_Weights,
+// but preserve the resulting relative ordering of the runes.
+func implicitPrimary(r rune) int {
+
+ if r >= minUnified && r <= maxUnified {
+ // The most common case for CJK.
+ return int(r) + commonUnifiedOffset
+ }
+ if r >= minCompatibility && r <= maxCompatibility {
+ // This will never hit as long as we don't remove the characters
+ // that would match from the table.
+ return int(r) + commonUnifiedOffset
+ }
+ if unicode.Is(unicode.Unified_Ideograph, r) {
+ return int(r) + rareUnifiedOffset
+ }
+ return int(r) + otherOffset
+}
--- /dev/null
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package build
+
+import "testing"
+
+type ceTest struct {
+ f func(in []int) (uint32, error)
+ arg []int
+ val uint32
+}
+
+func normalCE(in []int) (ce uint32, err error) {
+ return makeCE(in)
+}
+
+func expandCE(in []int) (ce uint32, err error) {
+ return makeExpandIndex(in[0])
+}
+
+func contractCE(in []int) (ce uint32, err error) {
+ return makeContractIndex(ctHandle{in[0], in[1]}, in[2])
+}
+
+func decompCE(in []int) (ce uint32, err error) {
+ return makeDecompose(in[0], in[1])
+}
+
+var ceTests = []ceTest{
+ {normalCE, []int{0, 0, 0}, 000},
+ {normalCE, []int{0, 30, 3}, 0x1E03},
+ {normalCE, []int{100, defaultSecondary, 3}, 0x40006403},
+ {normalCE, []int{100, 0, 3}, 0xFFFF}, // non-ignorable primary with non-default secondary
+ {normalCE, []int{100, 1, 3}, 0xFFFF},
+ {normalCE, []int{1 << maxPrimaryBits, defaultSecondary, 0}, 0xFFFF},
+ {normalCE, []int{0, 1 << maxSecondaryBits, 0}, 0xFFFF},
+ {normalCE, []int{100, defaultSecondary, 1 << maxTertiaryBits}, 0xFFFF},
+
+ {contractCE, []int{0, 0, 0}, 0x80000000},
+ {contractCE, []int{1, 1, 1}, 0x80010021},
+ {contractCE, []int{1, (1 << maxNBits) - 1, 1}, 0x8001003F},
+ {contractCE, []int{(1 << maxTrieIndexBits) - 1, 1, 1}, 0x8001FFE1},
+ {contractCE, []int{1, 1, (1 << maxContractOffsetBits) - 1}, 0xBFFF0021},
+ {contractCE, []int{1, (1 << maxNBits), 1}, 0xFFFF},
+ {contractCE, []int{(1 << maxTrieIndexBits), 1, 1}, 0xFFFF},
+ {contractCE, []int{1, (1 << maxContractOffsetBits), 1}, 0xFFFF},
+
+ {expandCE, []int{0}, 0xC0000000},
+ {expandCE, []int{5}, 0xC0000005},
+ {expandCE, []int{(1 << maxExpandIndexBits) - 1}, 0xDFFFFFFF},
+ {expandCE, []int{1 << maxExpandIndexBits}, 0xFFFF},
+
+ {decompCE, []int{0, 0}, 0xE0000000},
+ {decompCE, []int{1, 1}, 0xE0000101},
+ {decompCE, []int{0x1F, 0x1F}, 0xE0001F1F},
+ {decompCE, []int{256, 0x1F}, 0xFFFF},
+ {decompCE, []int{0x1F, 256}, 0xFFFF},
+}
+
+func TestColElem(t *testing.T) {
+ for i, tt := range ceTests {
+ in := make([]int, len(tt.arg))
+ copy(in, tt.arg)
+ ce, err := tt.f(in)
+ if tt.val == 0xFFFF {
+ if err == nil {
+ t.Errorf("%d: expected error for args %x", i, tt.arg)
+ }
+ continue
+ }
+ if err != nil {
+ t.Errorf("%d: unexpected error: %v", i, err.Error())
+ }
+ if ce != tt.val {
+ t.Errorf("%d: colElem=%X; want %X", i, ce, tt.val)
+ }
+ }
+}
--- /dev/null
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package collate
+
+import (
+ "unicode"
+)
+
+// weights holds the decoded weights per collation level.
+type weights struct {
+ primary uint32
+ secondary uint16
+ tertiary uint8
+ // TODO: compute quaternary on the fly or compress this value into 8 bits
+ // such that weights fit within 64bit.
+ quaternary uint32
+}
+
+const (
+ defaultSecondary = 0x20
+ defaultTertiary = 0x2
+ maxTertiary = 0x1F
+)
+
+// colElem is a representation of a collation element.
+// In the typical case, a rune maps to a single collation element. If a rune
+// can be the start of a contraction or expands into multiple collation elements,
+// then the colElem that is associated with a rune will have a special form to represent
+// such m to n mappings. Such special colElems have a value >= 0x80000000.
+type colElem uint32
+
+const (
+ maxCE colElem = 0x7FFFFFFF
+ minContract = 0x80000000
+ maxContract = 0xBFFFFFFF
+ minExpand = 0xC0000000
+ maxExpand = 0xDFFFFFFF
+ minDecomp = 0xE0000000
+)
+
+type ceType int
+
+const (
+ ceNormal ceType = iota // ceNormal includes implicits (ce == 0)
+ ceContractionIndex // rune can be a start of a contraction
+ ceExpansionIndex // rune expands into a sequence of collation elements
+ ceDecompose // rune expands using NFKC decomposition
+)
+
+func (ce colElem) ctype() ceType {
+ if ce <= maxCE {
+ return ceNormal
+ }
+ if ce <= maxContract {
+ return ceContractionIndex
+ } else {
+ if ce <= maxExpand {
+ return ceExpansionIndex
+ }
+ return ceDecompose
+ }
+ panic("should not reach here")
+ return ceType(-1)
+}
+
+// For normal collation elements, we assume that a collation element either has
+// a primary or non-default secondary value, not both.
+// Collation elements with a primary value are of the form
+// 010ppppp pppppppp pppppppp tttttttt, where
+// - p* is primary collation value
+// - t* is the tertiary collation value
+// Collation elements with a secondary value are of the form
+// 00000000 ssssssss ssssssss tttttttt, where
+// - s* is the secondary collation value
+// - t* is the tertiary collation value
+func splitCE(ce colElem) weights {
+ w := weights{}
+ w.tertiary = uint8(ce)
+ if ce&0x40000000 != 0 {
+ // primary weight form
+ w.primary = uint32((ce >> 8) & 0x1FFFFF)
+ w.secondary = defaultSecondary
+ } else {
+ // secondary weight form
+ w.secondary = uint16(ce >> 8)
+ }
+ return w
+}
+
+// For contractions, colElems are of the form 10bbbbbb bbbbbbbb hhhhhhhh hhhhhhhh, where
+// - h* is the compTrieHandle.
+// - b* is the offset into the contraction collation element table.
+// See contract.go for details on the contraction trie.
+const (
+ maxNBits = 5
+ maxTrieIndexBits = 11
+ maxContractOffsetBits = 14
+)
+
+func splitContractIndex(ce colElem) (index, n, offset int) {
+ h := uint16(ce)
+ return int(h >> maxNBits), int(h & (1<<maxNBits - 1)), int(ce>>16) & (1<<maxContractOffsetBits - 1)
+}
+
+// For expansions, colElems are of the form 110bbbbb bbbbbbbb bbbbbbbb bbbbbbbb,
+// where b* is the index into the expansion sequence table.
+const (
+ maxExpandIndexBits = 29
+)
+
+func splitExpandIndex(ce colElem) (index int) {
+ index = int(ce) & (1<<maxExpandIndexBits - 1)
+ return
+}
+
+// Some runes can be expanded using NFKD decomposition. Instead of storing the full
+// sequence of collation elements, we decompose the rune and lookup the collation
+// elements for each rune in the decomposition and modify the tertiary weights.
+// The colElem, in this case, is of the form 11100000 00000000 wwwwwwww vvvvvvvv, where
+// - v* is the replacement tertiary weight for the first rune,
+// - w* is the replacement tertiary weight for the second rune,
+// Tertiary weights of subsequent runes should be replaced with maxTertiary.
+// See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
+const (
+ decompID = 0xE0000000
+)
+
+func splitDecompose(ce colElem) (t1, t2 uint8) {
+ return uint8(ce), uint8(ce >> 8)
+}
+
+const (
+ // These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
+ minUnified rune = 0x4E00
+ maxUnified = 0x9FFF
+ minCompatibility = 0xF900
+ maxCompatibility = 0xFAFF
+ minRare = 0x3400
+ maxRare = 0x4DBF
+)
+const (
+ commonUnifiedOffset = 0xFB40
+ rareUnifiedOffset = 0x1FB40
+ otherOffset = 0x4FB40
+ maxPrimary = otherOffset + unicode.MaxRune
+)
+
+// implicitPrimary returns the primary weight for the a rune
+// for which there is no entry for the rune in the collation table.
+// We take a different approach from the one specified in
+// http://unicode.org/reports/tr10/#Implicit_Weights,
+// but preserve the resulting relative ordering of the runes.
+func implicitPrimary(r rune) int {
+
+ if r >= minUnified && r <= maxUnified {
+ // The most common case for CJK.
+ return int(r) + commonUnifiedOffset
+ }
+ if r >= minCompatibility && r <= maxCompatibility {
+ // This will never hit as long as we don't remove the characters
+ // that would match from the table.
+ return int(r) + commonUnifiedOffset
+ }
+ if unicode.Is(unicode.Unified_Ideograph, r) {
+ return int(r) + rareUnifiedOffset
+ }
+ return int(r) + otherOffset
+}
--- /dev/null
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package collate
+
+import (
+ "testing"
+ "unicode"
+)
+
+type ceTest struct {
+ f func(inout []int) (colElem, ceType)
+ arg []int
+}
+
+// The make* funcs are simplified versions of the functions in build/colelem.go
+func makeCE(weights []int) colElem {
+ const (
+ maxPrimaryBits = 21
+ maxSecondaryBits = 16
+ maxTertiaryBits = 8
+ isPrimary = 0x40000000
+ )
+ var ce colElem
+ if weights[0] != 0 {
+ ce = colElem(weights[0]<<maxTertiaryBits + weights[2])
+ ce |= isPrimary
+ } else {
+ ce = colElem(weights[1]<<maxTertiaryBits + weights[2])
+ }
+ return ce
+}
+
+func makeContractIndex(index, n, offset int) colElem {
+ const (
+ contractID = 0x80000000
+ maxNBits = 5
+ maxTrieIndexBits = 11
+ )
+ ce := colElem(contractID)
+ ce += colElem(offset << (maxTrieIndexBits + maxNBits))
+ ce += colElem(index << maxNBits)
+ ce += colElem(n)
+ return ce
+}
+
+func makeExpandIndex(index int) colElem {
+ const expandID = 0xC0000000
+ return expandID + colElem(index)
+}
+
+func makeDecompose(t1, t2 int) colElem {
+ const decompID = 0xE0000000
+ return colElem(t2<<8+t1) + decompID
+}
+
+func normalCE(inout []int) (ce colElem, t ceType) {
+ w := splitCE(makeCE(inout))
+ inout[0] = int(w.primary)
+ inout[1] = int(w.secondary)
+ inout[2] = int(w.tertiary)
+ return ce, ceNormal
+}
+
+func expandCE(inout []int) (ce colElem, t ceType) {
+ ce = makeExpandIndex(inout[0])
+ inout[0] = splitExpandIndex(ce)
+ return ce, ceExpansionIndex
+}
+
+func contractCE(inout []int) (ce colElem, t ceType) {
+ ce = makeContractIndex(inout[0], inout[1], inout[2])
+ i, n, o := splitContractIndex(ce)
+ inout[0], inout[1], inout[2] = i, n, o
+ return ce, ceContractionIndex
+}
+
+func decompCE(inout []int) (ce colElem, t ceType) {
+ ce = makeDecompose(inout[0], inout[1])
+ t1, t2 := splitDecompose(ce)
+ inout[0], inout[1] = int(t1), int(t2)
+ return ce, ceDecompose
+}
+
+const (
+ maxPrimaryBits = 21
+ maxSecondaryBits = 16
+ maxTertiaryBits = 8
+)
+
+var ceTests = []ceTest{
+ {normalCE, []int{0, 0, 0}},
+ {normalCE, []int{0, 30, 3}},
+ {normalCE, []int{100, defaultSecondary, 3}},
+
+ {contractCE, []int{0, 0, 0}},
+ {contractCE, []int{1, 1, 1}},
+ {contractCE, []int{1, (1 << maxNBits) - 1, 1}},
+ {contractCE, []int{(1 << maxTrieIndexBits) - 1, 1, 1}},
+ {contractCE, []int{1, 1, (1 << maxContractOffsetBits) - 1}},
+
+ {expandCE, []int{0}},
+ {expandCE, []int{5}},
+ {expandCE, []int{(1 << maxExpandIndexBits) - 1}},
+
+ {decompCE, []int{0, 0}},
+ {decompCE, []int{1, 1}},
+ {decompCE, []int{0x1F, 0x1F}},
+}
+
+func TestColElem(t *testing.T) {
+ for i, tt := range ceTests {
+ inout := make([]int, len(tt.arg))
+ copy(inout, tt.arg)
+ ce, typ := tt.f(inout)
+ if ce.ctype() != typ {
+ t.Errorf("%d: type is %d; want %d", i, ce.ctype(), typ)
+ }
+ for j, a := range tt.arg {
+ if inout[j] != a {
+ t.Errorf("%d: argument %d is %d; want %d", i, j, inout[j], a)
+ }
+ }
+ }
+}
+
+type implicitTest struct {
+ r rune
+ p int
+}
+
+var implicitTests = []implicitTest{
+ {0x33FF, 0x52F3F},
+ {0x3400, 0x22F40},
+ {0x4DC0, 0x54900},
+ {0x4DFF, 0x5493F},
+ {0x4E00, 0x14940},
+ {0x9FCB, 0x19B0B},
+ {0xA000, 0x59B40},
+ {0xF8FF, 0x5F43F},
+ {0xF900, 0x1F440},
+ {0xFA23, 0x1F563},
+ {0xFAFF, 0x1F63F},
+ {0xFB00, 0x5F640},
+ {0x20000, 0x3FB40},
+ {0x2B81C, 0x4B35C},
+ {unicode.MaxRune, 0x15FB3F}, // maximum primary value
+}
+
+func TestImplicit(t *testing.T) {
+ for _, tt := range implicitTests {
+ if p := implicitPrimary(tt.r); p != tt.p {
+ t.Errorf("%U: was %X; want %X", tt.r, p, tt.p)
+ }
+ }
+}