From: Marcel van Lohuizen Date: Fri, 13 Jul 2012 09:38:00 +0000 (+0200) Subject: exp/locale/collate: adjusted contraction trie to support Myanmar (Burmese), X-Git-Tag: go1.1rc2~2807 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=adc19ac5e3fc4914df4b09400686c964e379532b;p=gostls13.git exp/locale/collate: adjusted contraction trie to support Myanmar (Burmese), which has a rather large contraction table. The value of the next state offset now starts after the current block, instead of before. This is slightly less efficient (on extra addition per state change), but gives some extra range for the offsets. Also introduced constants for final (0) and noIndex (0xFF). tables.go is updated in a separate CL. R=r CC=golang-dev https://golang.org/cl/6346092 --- diff --git a/src/pkg/exp/locale/collate/build/contract.go b/src/pkg/exp/locale/collate/build/contract.go index 1f8691ba04..45d8f74b9b 100644 --- a/src/pkg/exp/locale/collate/build/contract.go +++ b/src/pkg/exp/locale/collate/build/contract.go @@ -41,6 +41,11 @@ import ( // also includes the length and offset to the next sequence of entries // to check in case of a match. +const ( + final = 0 + noIndex = 0xFF +) + // ctEntry associates to a matching byte an offset and/or next sequence of // bytes to check. A ctEntry c is called final if a match means that the // longest suffix has been found. An entry c is final if c.n == 0. @@ -50,24 +55,24 @@ import ( // Examples: // The suffix strings "ab" and "ac" can be represented as: // []ctEntry{ -// {'a', 1, 1, 0xFF}, // 'a' by itself does not match, so i is 0xFF. +// {'a', 1, 1, noIndex}, // 'a' by itself does not match, so i is 0xFF. // {'b', 'c', 0, 1}, // "ab" -> 1, "ac" -> 2 // } // // The suffix strings "ab", "abc", "abd", and "abcd" can be represented as: // []ctEntry{ -// {'a', 1, 1, 0xFF}, // 'a' must be followed by 'b'. -// {'b', 2, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'. -// {'d', 'd', 0, 3}, // "abd" -> 3 +// {'a', 1, 1, noIndex}, // 'a' must be followed by 'b'. +// {'b', 1, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'. +// {'d', 'd', final, 3}, // "abd" -> 3 // {'c', 4, 1, 2}, // "abc" -> 2, may be followed by 'd'. -// {'d', 'd', 0, 4}, // "abcd" -> 4 +// {'d', 'd', final, 4}, // "abcd" -> 4 // } // See genStateTests in contract_test.go for more examples. type ctEntry struct { l uint8 // non-final: byte value to match; final: lowest match in range. h uint8 // non-final: relative index to next block; final: highest match in range. - n uint8 // non-final: length of next block; final: 0 - i uint8 // result offset. Will be 0xFF if more bytes are needed to complete. + n uint8 // non-final: length of next block; final: final + i uint8 // result offset. Will be noIndex if more bytes are needed to complete. } // contractTrieSet holds a set of contraction tries. The tries are stored @@ -124,7 +129,7 @@ func (ct *contractTrieSet) genStates(sis []stridx) (int, error) { } } if !added { - *ct = append(*ct, ctEntry{l: c, i: 0xFF}) + *ct = append(*ct, ctEntry{l: c, i: noIndex}) } } else { for j := len(*ct) - 1; j >= start; j-- { @@ -140,7 +145,7 @@ func (ct *contractTrieSet) genStates(sis []stridx) (int, error) { } } if !added { - *ct = append(*ct, ctEntry{l: c, h: c, i: uint8(si.index)}) + *ct = append(*ct, ctEntry{l: c, h: c, n: final, i: uint8(si.index)}) } } } @@ -150,7 +155,7 @@ func (ct *contractTrieSet) genStates(sis []stridx) (int, error) { for i, end := start, len(*ct); i < end; i++ { fe := (*ct)[i] if fe.h == 0 { // uninitialized non-final - ln := len(*ct) - start + ln := len(*ct) - start - n if ln > 0xFF { return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln) } @@ -238,16 +243,16 @@ func (ct *contractTrieSet) lookup(h ctHandle, str []byte) (index, ns int) { if c >= e.l { p++ if e.l == c { - if e.i != 0xFF { + if e.i != noIndex { index, ns = int(e.i), p } - if e.n != 0 { + if e.n != final { // set to new state - i, states, n = 0, states[e.h:], int(e.n) + i, states, n = 0, states[int(e.h)+n:], int(e.n) } else { return } - } else if e.n == 0 && c <= e.h { + } else if e.n == final && c <= e.h { return int(c-e.l) + int(e.i), p } } else { diff --git a/src/pkg/exp/locale/collate/build/contract_test.go b/src/pkg/exp/locale/collate/build/contract_test.go index ea5f3c077a..eeca0c8954 100644 --- a/src/pkg/exp/locale/collate/build/contract_test.go +++ b/src/pkg/exp/locale/collate/build/contract_test.go @@ -111,9 +111,9 @@ var genStateTests = []GenStateTest{ }, 1, contractTrieSet{ - {'a', 1, 1, 0xFF}, - {'b', 1, 1, 0xFF}, - {'c', 'c', 0, 1}, + {'a', 0, 1, noIndex}, + {'b', 0, 1, noIndex}, + {'c', 'c', final, 1}, }, }, {[]stridx{ @@ -123,9 +123,9 @@ var genStateTests = []GenStateTest{ }, 1, contractTrieSet{ - {'a', 1, 1, 0xFF}, - {'b', 1, 1, 0xFF}, - {'c', 'e', 0, 1}, + {'a', 0, 1, noIndex}, + {'b', 0, 1, noIndex}, + {'c', 'e', final, 1}, }, }, {[]stridx{ @@ -135,9 +135,9 @@ var genStateTests = []GenStateTest{ }, 1, contractTrieSet{ - {'a', 1, 1, 3}, - {'b', 1, 1, 2}, - {'c', 'c', 0, 1}, + {'a', 0, 1, 3}, + {'b', 0, 1, 2}, + {'c', 'c', final, 1}, }, }, {[]stridx{ @@ -150,11 +150,11 @@ var genStateTests = []GenStateTest{ }, 2, contractTrieSet{ - {'b', 'b', 0, 6}, - {'a', 2, 2, 5}, - {'c', 'c', 0, 4}, - {'b', 2, 1, 3}, - {'c', 'd', 0, 1}, + {'b', 'b', final, 6}, + {'a', 0, 2, 5}, + {'c', 'c', final, 4}, + {'b', 0, 1, 3}, + {'c', 'd', final, 1}, }, }, {[]stridx{ @@ -168,14 +168,14 @@ var genStateTests = []GenStateTest{ }, 2, contractTrieSet{ - {'b', 5, 1, 0xFF}, - {'a', 2, 1, 0xFF}, - {'b', 1, 1, 6}, - {'c', 1, 1, 4}, - {'d', 'd', 0, 1}, - {'c', 1, 1, 7}, - {'d', 1, 1, 5}, - {'e', 'f', 0, 2}, + {'b', 3, 1, noIndex}, + {'a', 0, 1, noIndex}, + {'b', 0, 1, 6}, + {'c', 0, 1, 4}, + {'d', 'd', final, 1}, + {'c', 0, 1, 7}, + {'d', 0, 1, 5}, + {'e', 'f', final, 2}, }, }, } @@ -251,13 +251,13 @@ func TestPrintContractionTrieSet(t *testing.T) { const contractTrieOutput = `// testCTEntries: 8 entries, 32 bytes var testCTEntries = [8]struct{l,h,n,i uint8}{ - {0x62, 0x5, 1, 255}, - {0x61, 0x2, 1, 255}, - {0x62, 0x1, 1, 6}, - {0x63, 0x1, 1, 4}, + {0x62, 0x3, 1, 255}, + {0x61, 0x0, 1, 255}, + {0x62, 0x0, 1, 6}, + {0x63, 0x0, 1, 4}, {0x64, 0x64, 0, 1}, - {0x63, 0x1, 1, 7}, - {0x64, 0x1, 1, 5}, + {0x63, 0x0, 1, 7}, + {0x64, 0x0, 1, 5}, {0x65, 0x66, 0, 2}, } var testContractTrieSet = contractTrieSet( testCTEntries[:] ) diff --git a/src/pkg/exp/locale/collate/contract.go b/src/pkg/exp/locale/collate/contract.go index 28b9a04aca..0d9bc401bc 100644 --- a/src/pkg/exp/locale/collate/contract.go +++ b/src/pkg/exp/locale/collate/contract.go @@ -37,6 +37,11 @@ func (s *ctScanner) result() (i, p int) { return s.index, s.pindex } +const ( + final = 0 + noIndex = 0xFF +) + // scan matches the longest suffix at the current location in the input // and returns the number of bytes consumed. func (s *ctScanner) scan(p int) int { @@ -53,12 +58,12 @@ func (s *ctScanner) scan(p int) int { if c >= e.l { if e.l == c { p++ - if e.i != 0xFF { + if e.i != noIndex { s.index = int(e.i) s.pindex = p } - if e.n != 0 { - i, states, n = 0, states[e.h:], int(e.n) + if e.n != final { + i, states, n = 0, states[int(e.h)+n:], int(e.n) if p >= len(str) || utf8.RuneStart(str[p]) { s.states, s.n, pr = states, n, p } @@ -67,7 +72,7 @@ func (s *ctScanner) scan(p int) int { return p } continue - } else if e.n == 0 && c <= e.h { + } else if e.n == final && c <= e.h { p++ s.done = true s.index = int(c-e.l) + int(e.i) diff --git a/src/pkg/exp/locale/collate/contract_test.go b/src/pkg/exp/locale/collate/contract_test.go index fd94d9c5c0..f3710a183a 100644 --- a/src/pkg/exp/locale/collate/contract_test.go +++ b/src/pkg/exp/locale/collate/contract_test.go @@ -30,8 +30,8 @@ var lookupTests = []LookupTest{ }, 1, contractTrieSet{ - {'a', 1, 1, 0xFF}, - {'b', 1, 1, 0xFF}, + {'a', 0, 1, 0xFF}, + {'b', 0, 1, 0xFF}, {'c', 'c', 0, 1}, }, }, @@ -46,8 +46,8 @@ var lookupTests = []LookupTest{ }, 1, contractTrieSet{ - {'a', 1, 1, 0xFF}, - {'b', 1, 1, 0xFF}, + {'a', 0, 1, 0xFF}, + {'b', 0, 1, 0xFF}, {'c', 'e', 0, 1}, }, }, @@ -60,8 +60,8 @@ var lookupTests = []LookupTest{ }, 1, contractTrieSet{ - {'a', 1, 1, 3}, - {'b', 1, 1, 2}, + {'a', 0, 1, 3}, + {'b', 0, 1, 2}, {'c', 'c', 0, 1}, }, }, @@ -77,9 +77,9 @@ var lookupTests = []LookupTest{ 2, contractTrieSet{ {'b', 'b', 0, 6}, - {'a', 2, 2, 5}, + {'a', 0, 2, 5}, {'c', 'c', 0, 4}, - {'b', 2, 1, 3}, + {'b', 0, 1, 3}, {'c', 'd', 0, 1}, }, }, @@ -94,13 +94,13 @@ var lookupTests = []LookupTest{ }, 2, contractTrieSet{ - {'b', 5, 1, 0xFF}, - {'a', 2, 1, 0xFF}, - {'b', 1, 1, 6}, - {'c', 1, 1, 4}, + {'b', 3, 1, 0xFF}, + {'a', 0, 1, 0xFF}, + {'b', 0, 1, 6}, + {'c', 0, 1, 4}, {'d', 'd', 0, 1}, - {'c', 1, 1, 7}, - {'d', 1, 1, 5}, + {'c', 0, 1, 7}, + {'d', 0, 1, 5}, {'e', 'f', 0, 2}, }, },