// the UTF-8 characters in order. Only the rune array is maintained in sorted
// order. flush writes the resulting segment to a byte array.
type reorderBuffer struct {
- rune [maxBufferSize]runeInfo // Per character info.
- byte [maxByteBufferSize]byte // UTF-8 buffer. Referenced by runeInfo.pos.
- nrune int // Number of runeInfos.
- nbyte uint8 // Number or bytes.
+ rune [maxBufferSize]Properties // Per character info.
+ byte [maxByteBufferSize]byte // UTF-8 buffer. Referenced by Properties.pos.
+ nrune int // Number of Properties entries in rune.
+ nbyte uint8 // Number of bytes.
f formInfo
src input
// insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
// It returns false if the buffer is not large enough to hold the rune.
// It is used internally by insert and insertString only.
-func (rb *reorderBuffer) insertOrdered(info runeInfo) bool {
+func (rb *reorderBuffer) insertOrdered(info Properties) bool {
n := rb.nrune
if n >= maxCombiningChars+1 {
return false
// insert inserts the given rune in the buffer ordered by CCC.
// It returns true if the buffer was large enough to hold the decomposed rune.
-func (rb *reorderBuffer) insert(src input, i int, info runeInfo) bool {
+func (rb *reorderBuffer) insert(src input, i int, info Properties) bool {
if rune := src.hangul(i); rune != 0 {
return rb.decomposeHangul(rune)
}
if info.hasDecomposition() {
- return rb.insertDecomposed(info.decomposition())
+ return rb.insertDecomposed(info.Decomposition())
}
return rb.insertSingle(src, i, info)
}
// insertSingle inserts an entry in the reorderBuffer for the rune at
// position i. info is the runeInfo for the rune at position i.
-func (rb *reorderBuffer) insertSingle(src input, i int, info runeInfo) bool {
+func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) bool {
// insertOrder changes nbyte
pos := rb.nbyte
if !rb.insertOrdered(info) {
bn := rb.nbyte
sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
rb.nbyte += utf8.UTFMax
- rb.rune[rb.nrune] = runeInfo{pos: bn, size: uint8(sz)}
+ rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)}
rb.nrune++
}
func (rb *reorderBuffer) assignRune(pos int, r rune) {
bn := rb.rune[pos].pos
sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
- rb.rune[pos] = runeInfo{pos: bn, size: uint8(sz)}
+ rb.rune[pos] = Properties{pos: bn, size: uint8(sz)}
}
// runeAt returns the rune at position n. It is used for Hangul and recomposition.
headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
)
-// runeInfo is a representation for the data stored in charinfoTrie.
-type runeInfo struct {
+// Properties provides access to normalization properties of a rune.
+type Properties struct {
pos uint8 // start position in reorderBuffer; used in composition.go
size uint8 // length of UTF-8 encoding of this rune
ccc uint8 // leading canonical combining class (ccc if not decomposition)
}
// functions dispatchable per form
-type lookupFunc func(b input, i int) runeInfo
+type lookupFunc func(b input, i int) Properties
// formInfo holds Form-specific functions and tables.
type formInfo struct {
// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
// unexpected behavior for the user. For example, in NFD, there is a boundary
-// after 'a'. However, a might combine with modifiers, so from the application's
+// after 'a'. However, 'a' might combine with modifiers, so from the application's
// perspective it is not a good boundary. We will therefore always use the
// boundaries for the combining variants.
-func (i runeInfo) boundaryBefore() bool {
- if i.ccc == 0 && !i.combinesBackward() {
+
+// BoundaryBefore returns true if this rune starts a new segment and
+// cannot combine with any rune on the left.
+func (p Properties) BoundaryBefore() bool {
+ if p.ccc == 0 && !p.combinesBackward() {
return true
}
// We assume that the CCC of the first character in a decomposition
return false
}
-func (i runeInfo) boundaryAfter() bool {
- return i.isInert()
+// BoundaryAfter returns true if this rune cannot combine with runes to the right
+// and always denotes the end of a segment.
+func (p Properties) BoundaryAfter() bool {
+ return p.isInert()
}
// We pack quick check data in 4 bits:
// influenced by normalization.
type qcInfo uint8
-func (i runeInfo) isYesC() bool { return i.flags&0x4 == 0 }
-func (i runeInfo) isYesD() bool { return i.flags&0x1 == 0 }
+func (p Properties) isYesC() bool { return p.flags&0x4 == 0 }
+func (p Properties) isYesD() bool { return p.flags&0x1 == 0 }
-func (i runeInfo) combinesForward() bool { return i.flags&0x8 != 0 }
-func (i runeInfo) combinesBackward() bool { return i.flags&0x2 != 0 } // == isMaybe
-func (i runeInfo) hasDecomposition() bool { return i.flags&0x1 != 0 } // == isNoD
+func (p Properties) combinesForward() bool { return p.flags&0x8 != 0 }
+func (p Properties) combinesBackward() bool { return p.flags&0x2 != 0 } // == isMaybe
+func (p Properties) hasDecomposition() bool { return p.flags&0x1 != 0 } // == isNoD
-func (r runeInfo) isInert() bool {
- return r.flags&0xf == 0 && r.ccc == 0
+func (p Properties) isInert() bool {
+ return p.flags&0xf == 0 && p.ccc == 0
}
-func (r runeInfo) decomposition() []byte {
- if r.index == 0 {
+// Decomposition returns the decomposition for the underlying rune
+// or nil if there is none.
+func (p Properties) Decomposition() []byte {
+ if p.index == 0 {
return nil
}
- p := r.index
- n := decomps[p] & 0x3F
- p++
- return decomps[p : p+uint16(n)]
+ i := p.index
+ n := decomps[i] & headerLenMask
+ i++
+ return decomps[i : i+uint16(n)]
+}
+
+// Size returns the length in bytes of the UTF-8 encoding of the rune.
+func (p Properties) Size() int {
+ return int(p.size)
+}
+
+// CCC returns the canonical combining class of the underlying rune.
+//
+// Decompositions stored at or beyond firstCCCZeroExcept belong to runes whose
+// own CCC is zero even though the first rune of their decomposition has a
+// non-zero CCC. For those runes the ccc field holds the leading CCC rather
+// than the CCC of the rune itself, so zero is returned explicitly.
+func (p Properties) CCC() uint8 {
+ // The bound is inclusive: firstCCCZeroExcept is the index of the first
+ // entry of the exception set, not the last entry before it.
+ if p.index >= firstCCCZeroExcept {
+ return 0
+ }
+ return p.ccc
+}
+
+// LeadCCC returns the CCC of the first rune in the decomposition.
+// If there is no decomposition, LeadCCC equals CCC.
+func (p Properties) LeadCCC() uint8 {
+ return p.ccc
+}
+
+// TrailCCC returns the CCC of the last rune in the decomposition.
+// If there is no decomposition, TrailCCC equals CCC.
+func (p Properties) TrailCCC() uint8 {
+ return p.tccc
}
// Recomposition
return recompMap[key]
}
-func lookupInfoNFC(b input, i int) runeInfo {
+func lookupInfoNFC(b input, i int) Properties {
v, sz := b.charinfoNFC(i)
return compInfo(v, sz)
}
-func lookupInfoNFKC(b input, i int) runeInfo {
+func lookupInfoNFKC(b input, i int) Properties {
v, sz := b.charinfoNFKC(i)
return compInfo(v, sz)
}
+// Properties looks up the first rune in s and returns its normalization
+// properties. NFC and NFD consult nfcTrie; all other forms consult nfkcTrie.
+func (f Form) Properties(s []byte) Properties {
+ switch f {
+ case NFC, NFD:
+ return compInfo(nfcTrie.lookup(s))
+ default:
+ return compInfo(nfkcTrie.lookup(s))
+ }
+}
+
+// PropertiesString looks up the first rune in s and returns its normalization
+// properties. NFC and NFD consult nfcTrie; all other forms consult nfkcTrie.
+func (f Form) PropertiesString(s string) Properties {
+ switch f {
+ case NFC, NFD:
+ return compInfo(nfcTrie.lookupString(s))
+ default:
+ return compInfo(nfkcTrie.lookupString(s))
+ }
+}
+
// compInfo converts the information contained in v and sz
-// to a runeInfo. See the comment at the top of the file
+// to a Properties. See the comment at the top of the file
// for more information on the format.
-func compInfo(v uint16, sz int) runeInfo {
+func compInfo(v uint16, sz int) Properties {
if v == 0 {
- return runeInfo{size: uint8(sz)}
+ return Properties{size: uint8(sz)}
} else if v >= 0x8000 {
- return runeInfo{
+ return Properties{
size: uint8(sz),
ccc: uint8(v),
tccc: uint8(v),
// has decomposition
h := decomps[v]
f := (qcInfo(h&headerFlagsMask) >> 4) | 0x1
- ri := runeInfo{size: uint8(sz), flags: f, index: v}
+ ri := Properties{size: uint8(sz), flags: f, index: v}
if v >= firstCCC {
v += uint16(h&headerLenMask) + 1
ri.tccc = decomps[v]
// to a given Form.
type Iter struct {
rb reorderBuffer
- info runeInfo // first character saved from previous iteration
- next iterFunc // implementation of next depends on form
+ info Properties // first character saved from previous iteration
+ next iterFunc // implementation of next depends on form
p int // current position in input source
outStart int // start of current segment in output buffer
break
}
}
- } else if d := i.info.decomposition(); d != nil {
+ } else if d := i.info.Decomposition(); d != nil {
i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
p := outp + len(d)
if p > i.maxseg && i.setStart(outp, i.p) {
if i.setStart(outp-1, i.p-1) {
i.p--
outp--
- i.info = runeInfo{size: 1}
+ i.info = Properties{size: 1}
break
}
}
return outp
}
i.info = i.rb.f.info(i.rb.src, i.p)
- if i.info.boundaryBefore() {
+ if i.info.BoundaryBefore() {
break
}
}
lccc := ccc(d[0])
tccc := ccc(d[len(d)-1])
+ // cc is the combining class of the rune itself, as opposed to the
+ // lead/trail classes of its decomposition computed above.
+ cc := ccc(r)
+ if cc != 0 && lccc == 0 && tccc == 0 {
+ // Pass r for the %U verb; the format string has two verbs.
+ logger.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc)
+ }
if tccc < lccc && lccc != 0 {
const msg = "%U: lccc (%d) must be <= tcc (%d)"
logger.Fatalf(msg, r, lccc, tccc)
index = 1
if lccc > 0 {
s += string([]byte{lccc})
- index |= 2
+ index = 2
+ }
+ // A rune whose own ccc differs from the leading ccc of its
+ // decomposition is only supported when its own ccc is zero; such
+ // entries get their own index.
+ if cc != lccc {
+ if cc != 0 {
+ // Pass r for the %U verb; the format string has two verbs.
+ logger.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc)
+ }
+ index = 3
+ }
}
return index, s
size := 0
positionMap := make(map[string]uint16)
decompositions.WriteString("\000")
- cname := []string{"firstCCC", "firstLeadingCCC", "", "lastDecomp"}
+ cname := []string{"firstCCC", "firstLeadingCCC", "firstCCCZeroExcept", "lastDecomp"}
fmt.Println("const (")
for i, m := range decompSet {
sa := []string{}
}
fd := &rb.f
if doMerge {
- var info runeInfo
+ var info Properties
if p < n {
info = fd.info(src, p)
- if p == 0 && !info.boundaryBefore() {
+ if p == 0 && !info.BoundaryBefore() {
out = decomposeToLastBoundary(rb, out)
}
}
- if info.size == 0 || info.boundaryBefore() {
+ if info.size == 0 || info.BoundaryBefore() {
if fd.composing {
rb.compose()
}
}
fd := &rb.f
info := fd.info(src, i)
- for n := 0; info.size != 0 && !info.boundaryBefore(); {
+ for n := 0; info.size != 0 && !info.BoundaryBefore(); {
i += int(info.size)
if n++; n >= maxCombiningChars {
return i
}
if i >= nsrc {
- if !info.boundaryAfter() {
+ if !info.BoundaryAfter() {
return -1
}
return nsrc
if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
return i
}
- if info.boundaryAfter() {
+ if info.BoundaryAfter() {
return i
}
i = p
- for n := 0; i >= 0 && !info.boundaryBefore(); {
+ for n := 0; i >= 0 && !info.BoundaryBefore(); {
info, p = lastRuneStart(fd, b[:i])
if n++; n >= maxCombiningChars {
return len(b)
break
}
info = rb.f.info(rb.src, sp)
- bound := info.boundaryBefore()
+ bound := info.BoundaryBefore()
if bound || info.size == 0 {
break
}
// lastRuneStart returns the runeInfo and position of the last
// rune in buf or the zero runeInfo and -1 if no rune was found.
-func lastRuneStart(fd *formInfo, buf []byte) (runeInfo, int) {
+func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
p := len(buf) - 1
for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
}
if p < 0 {
- return runeInfo{}, -1
+ return Properties{}, -1
}
return fd.info(inputBytes(buf), p), p
}
// illegal trailing continuation bytes
return buf
}
- if info.boundaryAfter() {
+ if info.BoundaryAfter() {
return buf
}
- var add [maxBackRunes]runeInfo // stores runeInfo in reverse order
+ var add [maxBackRunes]Properties // stores Properties in reverse order
add[0] = info
padd := 1
n := 1
p := len(buf) - int(info.size)
- for ; p >= 0 && !info.boundaryBefore(); p -= int(info.size) {
+ for ; p >= 0 && !info.BoundaryBefore(); p -= int(info.size) {
info, i = lastRuneStart(fd, buf[:p])
if int(info.size) != p-i {
break
i += int(info.size)
n++
} else {
- dcomp := info.decomposition()
+ dcomp := info.Decomposition()
for i := 0; i < len(dcomp); {
inf := rb.f.info(inputBytes(dcomp), i)
i += int(inf.size)
const Version = "6.0.0"
const (
- firstCCC = 0x2E45
- firstLeadingCCC = 0x4965
- lastDecomp = 0x49A2
- maxDecomp = 0x8000
+ firstCCC = 0x2E45
+ firstLeadingCCC = 0x4965
+ firstCCCZeroExcept = 0x497B
+ lastDecomp = 0x49A2
+ maxDecomp = 0x8000
)
// decomps: 18850 bytes
0xCC, 0x94, 0xCC, 0x81, 0xE6, 0x86, 0xCF, 0x89,
0xCC, 0x94, 0xCD, 0x82, 0xE6, 0x42, 0xCC, 0x80,
0xE6, 0xE6, 0x42, 0xCC, 0x81, 0xE6, 0xE6, 0x42,
- 0xCC, 0x93, 0xE6, 0xE6, 0x43, 0xE3, 0x82, 0x99,
- 0x08, 0x08, 0x43, 0xE3, 0x82, 0x9A, 0x08, 0x08,
+ 0xCC, 0x93, 0xE6, 0xE6, 0x44, 0xCC, 0x88, 0xCC,
+ 0x81, 0xE6, 0xE6, 0x43, 0xE3, 0x82, 0x99, 0x08,
// Bytes 4980 - 49bf
- 0x44, 0xCC, 0x88, 0xCC, 0x81, 0xE6, 0xE6, 0x46,
+ 0x08, 0x43, 0xE3, 0x82, 0x9A, 0x08, 0x08, 0x46,
0xE0, 0xBD, 0xB1, 0xE0, 0xBD, 0xB2, 0x82, 0x81,
0x46, 0xE0, 0xBD, 0xB1, 0xE0, 0xBD, 0xB4, 0x84,
0x81, 0x46, 0xE0, 0xBD, 0xB1, 0xE0, 0xBE, 0x80,
0x0236: 0x8001, 0x0237: 0x8001, 0x0238: 0x8601, 0x0239: 0x80dc, 0x023a: 0x80dc, 0x023b: 0x80dc,
0x023c: 0x80dc, 0x023d: 0x80e6, 0x023e: 0x80e6, 0x023f: 0x80e6,
// Block 0x9, offset 0x240
- 0x0240: 0x4965, 0x0241: 0x496a, 0x0242: 0x86e6, 0x0243: 0x496f, 0x0244: 0x4980, 0x0245: 0x86f0,
+ 0x0240: 0x4965, 0x0241: 0x496a, 0x0242: 0x86e6, 0x0243: 0x496f, 0x0244: 0x4974, 0x0245: 0x86f0,
0x0246: 0x80e6, 0x0247: 0x80dc, 0x0248: 0x80dc, 0x0249: 0x80dc, 0x024a: 0x80e6, 0x024b: 0x80e6,
0x024c: 0x80e6, 0x024d: 0x80dc, 0x024e: 0x80dc, 0x0250: 0x80e6, 0x0251: 0x80e6,
0x0252: 0x80e6, 0x0253: 0x80dc, 0x0254: 0x80dc, 0x0255: 0x80dc, 0x0256: 0x80dc, 0x0257: 0x80e6,
0x0236: 0x8001, 0x0237: 0x8001, 0x0238: 0x8601, 0x0239: 0x80dc, 0x023a: 0x80dc, 0x023b: 0x80dc,
0x023c: 0x80dc, 0x023d: 0x80e6, 0x023e: 0x80e6, 0x023f: 0x80e6,
// Block 0x9, offset 0x240
- 0x0240: 0x4965, 0x0241: 0x496a, 0x0242: 0x86e6, 0x0243: 0x496f, 0x0244: 0x4980, 0x0245: 0x86f0,
+ 0x0240: 0x4965, 0x0241: 0x496a, 0x0242: 0x86e6, 0x0243: 0x496f, 0x0244: 0x4974, 0x0245: 0x86f0,
0x0246: 0x80e6, 0x0247: 0x80dc, 0x0248: 0x80dc, 0x0249: 0x80dc, 0x024a: 0x80e6, 0x024b: 0x80e6,
0x024c: 0x80e6, 0x024d: 0x80dc, 0x024e: 0x80dc, 0x0250: 0x80e6, 0x0251: 0x80e6,
0x0252: 0x80e6, 0x0253: 0x80dc, 0x0254: 0x80dc, 0x0255: 0x80dc, 0x0256: 0x80dc, 0x0257: 0x80e6,
0x124c: 0x0a89, 0x124d: 0x0a8d, 0x124e: 0x0a91, 0x124f: 0x0a95, 0x1250: 0x0a99, 0x1251: 0x0a9d,
0x1252: 0x0aa1, 0x1253: 0x0aa5, 0x1254: 0x0aad, 0x1255: 0x0ab5, 0x1256: 0x0abd, 0x1257: 0x0ac1,
0x1258: 0x0ac5, 0x1259: 0x0ac9, 0x125a: 0x0acd, 0x125b: 0x0ad1, 0x125c: 0x0ad5, 0x125d: 0x0ae5,
- 0x125e: 0x4974, 0x125f: 0x497a, 0x1260: 0x0889, 0x1261: 0x07d9, 0x1262: 0x07dd, 0x1263: 0x0901,
+ 0x125e: 0x497b, 0x125f: 0x4981, 0x1260: 0x0889, 0x1261: 0x07d9, 0x1262: 0x07dd, 0x1263: 0x0901,
0x1264: 0x07e1, 0x1265: 0x0905, 0x1266: 0x0909, 0x1267: 0x07e5, 0x1268: 0x07e9, 0x1269: 0x07ed,
0x126a: 0x090d, 0x126b: 0x0911, 0x126c: 0x0915, 0x126d: 0x0919, 0x126e: 0x091d, 0x126f: 0x0921,
0x1270: 0x082d, 0x1271: 0x07f1, 0x1272: 0x07f5, 0x1273: 0x07f9, 0x1274: 0x0841, 0x1275: 0x07fd,