exp/norm: Added Iter type for iterating on segment boundaries. This type is mainly...

author Marcel van Lohuizen <mpvl@golang.org>

Tue, 21 Feb 2012 12:13:21 +0000 (13:13 +0100)

committer Marcel van Lohuizen <mpvl@golang.org>

Tue, 21 Feb 2012 12:13:21 +0000 (13:13 +0100)
author Marcel van Lohuizen <mpvl@golang.org>
Tue, 21 Feb 2012 12:13:21 +0000 (13:13 +0100)
committer Marcel van Lohuizen <mpvl@golang.org>
Tue, 21 Feb 2012 12:13:21 +0000 (13:13 +0100)
diff --git a/src/pkg/exp/norm/composition.go b/src/pkg/exp/norm/composition.go

index ccff4670602179e9d1ae6bf792234aaf166686f8..2cbe1ac730e43a24f6de10c137c8353bb6eaf69c 100644 (file)
--- a/src/pkg/exp/norm/composition.go
+++ b/src/pkg/exp/norm/composition.go
@@ -66,6 +66,18 @@ func (rb *reorderBuffer) flush(out []byte) []byte {
         return out
  }
  
+// flushCopy copies the normalized segment to buf and resets rb.
+// It returns the number of bytes written to buf.
+func (rb *reorderBuffer) flushCopy(buf []byte) int {
+       p := 0
+       for i := 0; i < rb.nrune; i++ {
+               runep := rb.rune[i]
+               p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size])
+       }
+       rb.reset()
+       return p
+}
+
  // insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
  // It returns false if the buffer is not large enough to hold the rune.
  // It is used internally by insert and insertString only.
@@ -96,32 +108,41 @@ func (rb *reorderBuffer) insertOrdered(info runeInfo) bool {
  // insert inserts the given rune in the buffer ordered by CCC.
  // It returns true if the buffer was large enough to hold the decomposed rune.
  func (rb *reorderBuffer) insert(src input, i int, info runeInfo) bool {
-       if info.size == 3 {
-               if rune := src.hangul(i); rune != 0 {
-                       return rb.decomposeHangul(rune)
-               }
+       if rune := src.hangul(i); rune != 0 {
+               return rb.decomposeHangul(rune)
         }
         if info.hasDecomposition() {
-               dcomp := info.decomposition()
-               rb.tmpBytes = inputBytes(dcomp)
-               for i := 0; i < len(dcomp); {
-                       info = rb.f.info(&rb.tmpBytes, i)
-                       pos := rb.nbyte
-                       if !rb.insertOrdered(info) {
-                               return false
-                       }
-                       end := i + int(info.size)
-                       copy(rb.byte[pos:], dcomp[i:end])
-                       i = end
-               }
-       } else {
-               // insertOrder changes nbyte
+               return rb.insertDecomposed(info.decomposition())
+       }
+       return rb.insertSingle(src, i, info)
+}
+
+// insertDecomposed inserts an entry in to the reorderBuffer for each rune
+// in dcomp.  dcomp must be a sequence of decomposed UTF-8-encoded runes.
+func (rb *reorderBuffer) insertDecomposed(dcomp []byte) bool {
+       saveNrune, saveNbyte := rb.nrune, rb.nbyte
+       rb.tmpBytes = inputBytes(dcomp)
+       for i := 0; i < len(dcomp); {
+               info := rb.f.info(&rb.tmpBytes, i)
                 pos := rb.nbyte
                 if !rb.insertOrdered(info) {
+                       rb.nrune, rb.nbyte = saveNrune, saveNbyte
                         return false
                 }
-               src.copySlice(rb.byte[pos:], i, i+int(info.size))
+               i += copy(rb.byte[pos:], dcomp[i:i+int(info.size)])
+       }
+       return true
+}
+
+// insertSingle inserts an entry in the reorderBuffer for the rune at
+// position i. info is the runeInfo for the rune at position i.
+func (rb *reorderBuffer) insertSingle(src input, i int, info runeInfo) bool {
+       // insertOrder changes nbyte
+       pos := rb.nbyte
+       if !rb.insertOrdered(info) {
+               return false
         }
+       src.copySlice(rb.byte[pos:], i, i+int(info.size))
         return true
  }
  
@@ -182,8 +203,12 @@ const (
         jamoLVTCount = 19 * 21 * 28
  )
  
-// Caller must verify that len(b) >= 3.
+const hangulUTF8Size = 3
+
  func isHangul(b []byte) bool {
+       if len(b) < hangulUTF8Size {
+               return false
+       }
         b0 := b[0]
         if b0 < hangulBase0 {
                 return false
@@ -202,8 +227,10 @@ func isHangul(b []byte) bool {
         return b1 == hangulEnd1 && b[2] < hangulEnd2
  }
  
-// Caller must verify that len(b) >= 3.
  func isHangulString(b string) bool {
+       if len(b) < hangulUTF8Size {
+               return false
+       }
         b0 := b[0]
         if b0 < hangulBase0 {
                 return false
@@ -234,6 +261,22 @@ func isHangulWithoutJamoT(b []byte) bool {
         return c < jamoLVTCount && c%jamoTCount == 0
  }
  
+// decomposeHangul writes the decomposed Hangul to buf and returns the number
+// of bytes written.  len(buf) should be at least 9.
+func decomposeHangul(buf []byte, r rune) int {
+       const JamoUTF8Len = 3
+       r -= hangulBase
+       x := r % jamoTCount
+       r /= jamoTCount
+       utf8.EncodeRune(buf, jamoLBase+r/jamoVCount)
+       utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount)
+       if x != 0 {
+               utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x)
+               return 3 * JamoUTF8Len
+       }
+       return 2 * JamoUTF8Len
+}
+
  // decomposeHangul algorithmically decomposes a Hangul rune into
  // its Jamo components.
  // See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
diff --git a/src/pkg/exp/norm/composition_test.go b/src/pkg/exp/norm/composition_test.go

index e32380d7afac6f822bcccfd4e12f4a3fc167da03..9de9eacfd653f308cc1032a25ad6daa06c4e96f5 100644 (file)
--- a/src/pkg/exp/norm/composition_test.go
+++ b/src/pkg/exp/norm/composition_test.go
@@ -47,14 +47,14 @@ func runTests(t *testing.T, name string, fm Form, f insertFunc, tests []TestCase
         }
  }
  
-func TestFlush(t *testing.T) {
+type flushFunc func(rb *reorderBuffer) []byte
+
+func testFlush(t *testing.T, name string, fn flushFunc) {
         rb := reorderBuffer{}
         rb.init(NFC, nil)
-       out := make([]byte, 0)
-
-       out = rb.flush(out)
+       out := fn(&rb)
         if len(out) != 0 {
-               t.Errorf("wrote bytes on flush of empty buffer. (len(out) = %d)", len(out))
+               t.Errorf("%s: wrote bytes on flush of empty buffer. (len(out) = %d)", name, len(out))
         }
  
         for _, r := range []rune("world!") {
@@ -65,16 +65,32 @@ func TestFlush(t *testing.T) {
         out = rb.flush(out)
         want := "Hello world!"
         if string(out) != want {
-               t.Errorf(`output after flush was "%s"; want "%s"`, string(out), want)
+               t.Errorf(`%s: output after flush was "%s"; want "%s"`, name, string(out), want)
         }
         if rb.nrune != 0 {
-               t.Errorf("flush: non-null size of info buffer (rb.nrune == %d)", rb.nrune)
+               t.Errorf("%s: non-null size of info buffer (rb.nrune == %d)", name, rb.nrune)
         }
         if rb.nbyte != 0 {
-               t.Errorf("flush: non-null size of byte buffer (rb.nbyte == %d)", rb.nbyte)
+               t.Errorf("%s: non-null size of byte buffer (rb.nbyte == %d)", name, rb.nbyte)
         }
  }
  
+func flushF(rb *reorderBuffer) []byte {
+       out := make([]byte, 0)
+       return rb.flush(out)
+}
+
+func flushCopyF(rb *reorderBuffer) []byte {
+       out := make([]byte, MaxSegmentSize)
+       n := rb.flushCopy(out)
+       return out[:n]
+}
+
+func TestFlush(t *testing.T) {
+       testFlush(t, "flush", flushF)
+       testFlush(t, "flushCopy", flushCopyF)
+}
+
  var insertTests = []TestCase{
         {[]rune{'a'}, []rune{'a'}},
         {[]rune{0x300}, []rune{0x300}},
diff --git a/src/pkg/exp/norm/input.go b/src/pkg/exp/norm/input.go

index 5c0968ba58cd5d7611fd0fa8a6188d36c41d5c82..9c564d67718b5f7a6be2b374cce59cae2b415c75 100644 (file)
--- a/src/pkg/exp/norm/input.go
+++ b/src/pkg/exp/norm/input.go
@@ -7,7 +7,7 @@ package norm
  import "unicode/utf8"
  
  type input interface {
-       skipASCII(p int) int
+       skipASCII(p, max int) int
         skipNonStarter(p int) int
         appendSlice(buf []byte, s, e int) []byte
         copySlice(buf []byte, s, e int)
@@ -18,8 +18,8 @@ type input interface {
  
  type inputString string
  
-func (s inputString) skipASCII(p int) int {
-       for ; p < len(s) && s[p] < utf8.RuneSelf; p++ {
+func (s inputString) skipASCII(p, max int) int {
+       for ; p < max && s[p] < utf8.RuneSelf; p++ {
         }
         return p
  }
@@ -59,8 +59,8 @@ func (s inputString) hangul(p int) rune {
  
  type inputBytes []byte
  
-func (s inputBytes) skipASCII(p int) int {
-       for ; p < len(s) && s[p] < utf8.RuneSelf; p++ {
+func (s inputBytes) skipASCII(p, max int) int {
+       for ; p < max && s[p] < utf8.RuneSelf; p++ {
         }
         return p
  }
diff --git a/src/pkg/exp/norm/iter.go b/src/pkg/exp/norm/iter.go

new file mode 100644 (file)

index 0000000..761ba90
--- /dev/null
+++ b/src/pkg/exp/norm/iter.go
@@ -0,0 +1,286 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package norm
+
+const MaxSegmentSize = maxByteBufferSize
+
+// An Iter iterates over a string or byte slice, while normalizing it
+// to a given Form.
+type Iter struct {
+       rb   reorderBuffer
+       info runeInfo // first character saved from previous iteration
+       next iterFunc // implementation of next depends on form
+
+       p        int // current position in input source
+       outStart int // start of current segment in output buffer
+       inStart  int // start of current segment in input source
+       maxp     int // position in output buffer after which not to start a new segment
+       maxseg   int // for tracking an excess of combining characters
+
+       tccc uint8
+       done bool
+}
+
+type iterFunc func(*Iter, []byte) int
+
+// SetInput initializes i to iterate over src after normalizing it to Form f.
+func (i *Iter) SetInput(f Form, src []byte) {
+       i.rb.init(f, src)
+       if i.rb.f.composing {
+               i.next = nextComposed
+       } else {
+               i.next = nextDecomposed
+       }
+       i.p = 0
+       if i.done = len(src) == 0; !i.done {
+               i.info = i.rb.f.info(i.rb.src, i.p)
+       }
+}
+
+// SetInputString initializes i to iterate over src after normalizing it to Form f.
+func (i *Iter) SetInputString(f Form, src string) {
+       i.rb.initString(f, src)
+       if i.rb.f.composing {
+               i.next = nextComposed
+       } else {
+               i.next = nextDecomposed
+       }
+       i.p = 0
+       if i.done = len(src) == 0; !i.done {
+               i.info = i.rb.f.info(i.rb.src, i.p)
+       }
+}
+
+// Pos returns the byte position at which the next call to Next will commence processing.
+func (i *Iter) Pos() int {
+       return i.p
+}
+
+// Done returns true if there is no more input to process.
+func (i *Iter) Done() bool {
+       return i.done
+}
+
+// Next writes f(i.input[i.Pos():n]...) to buffer buf, where n is the
+// largest boundary of i.input such that the result fits in buf.  
+// It returns the number of bytes written to buf.
+// len(buf) should be at least MaxSegmentSize. 
+// Done must be false before calling Next.
+func (i *Iter) Next(buf []byte) int {
+       return i.next(i, buf)
+}
+
+func (i *Iter) initNext(outn, inStart int) {
+       i.outStart = 0
+       i.inStart = inStart
+       i.maxp = outn - MaxSegmentSize
+       i.maxseg = MaxSegmentSize
+}
+
+// setStart resets the start of the new segment to the given position.
+// It returns true if there is not enough room for the new segment.
+func (i *Iter) setStart(outp, inp int) bool {
+       if outp > i.maxp {
+               return true
+       }
+       i.outStart = outp
+       i.inStart = inp
+       i.maxseg = outp + MaxSegmentSize
+       return false
+}
+
+func min(a, b int) int {
+       if a < b {
+               return a
+       }
+       return b
+}
+
+// nextDecomposed is the implementation of Next for forms NFD and NFKD.
+func nextDecomposed(i *Iter, out []byte) int {
+       var outp int
+       i.initNext(len(out), i.p)
+doFast:
+       inCopyStart, outCopyStart := i.p, outp // invariant xCopyStart <= i.xStart
+       for {
+               if sz := int(i.info.size); sz <= 1 {
+                       // ASCII or illegal byte.  Either way, advance by 1.
+                       i.p++
+                       outp++
+                       max := min(i.rb.nsrc, len(out)-outp+i.p)
+                       if np := i.rb.src.skipASCII(i.p, max); np > i.p {
+                               outp += np - i.p
+                               i.p = np
+                               if i.p >= i.rb.nsrc {
+                                       break
+                               }
+                               // ASCII may combine with consecutive runes.
+                               if i.setStart(outp-1, i.p-1) {
+                                       i.p--
+                                       outp--
+                                       i.info.size = 1
+                                       break
+                               }
+                       }
+               } else if d := i.info.decomposition(); d != nil {
+                       i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
+                       p := outp + len(d)
+                       if p > i.maxseg && i.setStart(outp, i.p) {
+                               return outp
+                       }
+                       copy(out[outp:], d)
+                       outp = p
+                       i.p += sz
+                       inCopyStart, outCopyStart = i.p, outp
+               } else if r := i.rb.src.hangul(i.p); r != 0 {
+                       i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
+                       for {
+                               outp += decomposeHangul(out[outp:], r)
+                               i.p += hangulUTF8Size
+                               if r = i.rb.src.hangul(i.p); r == 0 {
+                                       break
+                               }
+                               if i.setStart(outp, i.p) {
+                                       return outp
+                               }
+                       }
+                       inCopyStart, outCopyStart = i.p, outp
+               } else {
+                       p := outp + sz
+                       if p > i.maxseg && i.setStart(outp, i.p) {
+                               break
+                       }
+                       outp = p
+                       i.p += sz
+               }
+               if i.p >= i.rb.nsrc {
+                       break
+               }
+               prevCC := i.info.tccc
+               i.info = i.rb.f.info(i.rb.src, i.p)
+               if cc := i.info.ccc; cc == 0 {
+                       if i.setStart(outp, i.p) {
+                               break
+                       }
+               } else if cc < prevCC {
+                       goto doNorm
+               }
+       }
+       if inCopyStart != i.p {
+               i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
+       }
+       i.done = i.p >= i.rb.nsrc
+       return outp
+doNorm:
+       // Insert what we have decomposed so far in the reorderBuffer.
+       // As we will only reorder, there will always be enough room.
+       i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
+       if !i.rb.insertDecomposed(out[i.outStart:outp]) {
+               // Start over to prevent decompositions from crossing segment boundaries.
+               // This is a rare occurance.
+               i.p = i.inStart
+               i.info = i.rb.f.info(i.rb.src, i.p)
+       }
+       outp = i.outStart
+       for {
+               if !i.rb.insert(i.rb.src, i.p, i.info) {
+                       break
+               }
+               if i.p += int(i.info.size); i.p >= i.rb.nsrc {
+                       outp += i.rb.flushCopy(out[outp:])
+                       i.done = true
+                       return outp
+               }
+               i.info = i.rb.f.info(i.rb.src, i.p)
+               if i.info.ccc == 0 {
+                       break
+               }
+       }
+       // new segment or too many combining characters: exit normalization
+       if outp += i.rb.flushCopy(out[outp:]); i.setStart(outp, i.p) {
+               return outp
+       }
+       goto doFast
+}
+
+// nextComposed is the implementation of Next for forms NFC and NFKC.
+func nextComposed(i *Iter, out []byte) int {
+       var outp int
+       i.initNext(len(out), i.p)
+doFast:
+       inCopyStart, outCopyStart := i.p, outp // invariant xCopyStart <= i.xStart
+       var prevCC uint8
+       for {
+               if !i.info.isYesC() {
+                       goto doNorm
+               }
+               if cc := i.info.ccc; cc == 0 {
+                       if i.setStart(outp, i.p) {
+                               break
+                       }
+               } else if cc < prevCC {
+                       goto doNorm
+               }
+               prevCC = i.info.tccc
+               sz := int(i.info.size)
+               if sz == 0 {
+                       sz = 1 // illegal rune: copy byte-by-byte
+               }
+               p := outp + sz
+               if p > i.maxseg && i.setStart(outp, i.p) {
+                       break
+               }
+               outp = p
+               i.p += sz
+               max := min(i.rb.nsrc, len(out)-outp+i.p)
+               if np := i.rb.src.skipASCII(i.p, max); np > i.p {
+                       outp += np - i.p
+                       i.p = np
+                       if i.p >= i.rb.nsrc {
+                               break
+                       }
+                       // ASCII may combine with consecutive runes.
+                       if i.setStart(outp-1, i.p-1) {
+                               i.p--
+                               outp--
+                               i.info = runeInfo{size: 1}
+                               break
+                       }
+               }
+               if i.p >= i.rb.nsrc {
+                       break
+               }
+               i.info = i.rb.f.info(i.rb.src, i.p)
+       }
+       if inCopyStart != i.p {
+               i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.p)
+       }
+       i.done = i.p >= i.rb.nsrc
+       return outp
+doNorm:
+       i.rb.src.copySlice(out[outCopyStart:], inCopyStart, i.inStart)
+       outp, i.p = i.outStart, i.inStart
+       i.info = i.rb.f.info(i.rb.src, i.p)
+       for {
+               if !i.rb.insert(i.rb.src, i.p, i.info) {
+                       break
+               }
+               if i.p += int(i.info.size); i.p >= i.rb.nsrc {
+                       i.rb.compose()
+                       outp += i.rb.flushCopy(out[outp:])
+                       i.done = true
+                       return outp
+               }
+               i.info = i.rb.f.info(i.rb.src, i.p)
+               if i.info.boundaryBefore() {
+                       break
+               }
+       }
+       i.rb.compose()
+       if outp += i.rb.flushCopy(out[outp:]); i.setStart(outp, i.p) {
+               return outp
+       }
+       goto doFast
+}
diff --git a/src/pkg/exp/norm/iter_test.go b/src/pkg/exp/norm/iter_test.go

new file mode 100644 (file)

index 0000000..f6e8d81
--- /dev/null
+++ b/src/pkg/exp/norm/iter_test.go
@@ -0,0 +1,186 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package norm
+
+import (
+       "strings"
+       "testing"
+)
+
+var iterBufSizes = []int{
+       MaxSegmentSize,
+       1.5 * MaxSegmentSize,
+       2 * MaxSegmentSize,
+       3 * MaxSegmentSize,
+       100 * MaxSegmentSize,
+}
+
+func doIterNorm(f Form, buf []byte, s string) []byte {
+       acc := []byte{}
+       i := Iter{}
+       i.SetInputString(f, s)
+       for !i.Done() {
+               n := i.Next(buf)
+               acc = append(acc, buf[:n]...)
+       }
+       return acc
+}
+
+func runIterTests(t *testing.T, name string, f Form, tests []AppendTest, norm bool) {
+       for i, test := range tests {
+               in := test.left + test.right
+               gold := test.out
+               if norm {
+                       gold = string(f.AppendString(nil, test.out))
+               }
+               for _, sz := range iterBufSizes {
+                       buf := make([]byte, sz)
+                       out := string(doIterNorm(f, buf, in))
+                       if len(out) != len(gold) {
+                               const msg = "%s:%d:%d: length is %d; want %d"
+                               t.Errorf(msg, name, i, sz, len(out), len(gold))
+                       }
+                       if out != gold {
+                               // Find first rune that differs and show context.
+                               ir := []rune(out)
+                               ig := []rune(gold)
+                               for j := 0; j < len(ir) && j < len(ig); j++ {
+                                       if ir[j] == ig[j] {
+                                               continue
+                                       }
+                                       if j -= 3; j < 0 {
+                                               j = 0
+                                       }
+                                       for e := j + 7; j < e && j < len(ir) && j < len(ig); j++ {
+                                               const msg = "%s:%d:%d: runeAt(%d) = %U; want %U"
+                                               t.Errorf(msg, name, i, sz, j, ir[j], ig[j])
+                                       }
+                                       break
+                               }
+                       }
+               }
+       }
+}
+
+func rep(r rune, n int) string {
+       return strings.Repeat(string(r), n)
+}
+
+var iterTests = []AppendTest{
+       {"", ascii, ascii},
+       {"", txt_all, txt_all},
+       {"", "a" + rep(0x0300, MaxSegmentSize/2), "a" + rep(0x0300, MaxSegmentSize/2)},
+}
+
+var iterTestsD = []AppendTest{
+       { // segment overflow on unchanged character
+               "",
+               "a" + rep(0x0300, MaxSegmentSize/2) + "\u0316",
+               "a" + rep(0x0300, MaxSegmentSize/2-1) + "\u0316\u0300",
+       },
+       { // segment overflow on unchanged character + start value
+               "",
+               "a" + rep(0x0300, MaxSegmentSize/2+maxCombiningChars+4) + "\u0316",
+               "a" + rep(0x0300, MaxSegmentSize/2+maxCombiningChars) + "\u0316" + rep(0x300, 4),
+       },
+       { // segment overflow on decomposition
+               "",
+               "a" + rep(0x0300, MaxSegmentSize/2-1) + "\u0340",
+               "a" + rep(0x0300, MaxSegmentSize/2),
+       },
+       { // segment overflow on decomposition + start value
+               "",
+               "a" + rep(0x0300, MaxSegmentSize/2-1) + "\u0340" + rep(0x300, maxCombiningChars+4) + "\u0320",
+               "a" + rep(0x0300, MaxSegmentSize/2-1) + rep(0x300, maxCombiningChars+1) + "\u0320" + rep(0x300, 4),
+       },
+       { // start value after ASCII overflow
+               "",
+               rep('a', MaxSegmentSize) + rep(0x300, maxCombiningChars+2) + "\u0320",
+               rep('a', MaxSegmentSize) + rep(0x300, maxCombiningChars) + "\u0320\u0300\u0300",
+       },
+       { // start value after Hangul overflow
+               "",
+               rep(0xAC00, MaxSegmentSize/6) + rep(0x300, maxCombiningChars+2) + "\u0320",
+               strings.Repeat("\u1100\u1161", MaxSegmentSize/6) + rep(0x300, maxCombiningChars-1) + "\u0320" + rep(0x300, 3),
+       },
+       { // start value after cc=0
+               "",
+               "您您" + rep(0x300, maxCombiningChars+4) + "\u0320",
+               "您您" + rep(0x300, maxCombiningChars) + "\u0320" + rep(0x300, 4),
+       },
+       { // start value after normalization
+               "",
+               "\u0300\u0320a" + rep(0x300, maxCombiningChars+4) + "\u0320",
+               "\u0320\u0300a" + rep(0x300, maxCombiningChars) + "\u0320" + rep(0x300, 4),
+       },
+}
+
+var iterTestsC = []AppendTest{
+       { // ordering of non-composing combining characters
+               "",
+               "\u0305\u0316",
+               "\u0316\u0305",
+       },
+       { // segment overflow
+               "",
+               "a" + rep(0x0305, MaxSegmentSize/2+4) + "\u0316",
+               "a" + rep(0x0305, MaxSegmentSize/2-1) + "\u0316" + rep(0x305, 5),
+       },
+}
+
+func TestIterNextD(t *testing.T) {
+       runIterTests(t, "IterNextD1", NFKD, appendTests, true)
+       runIterTests(t, "IterNextD2", NFKD, iterTests, true)
+       runIterTests(t, "IterNextD3", NFKD, iterTestsD, false)
+}
+
+func TestIterNextC(t *testing.T) {
+       runIterTests(t, "IterNextC1", NFKC, appendTests, true)
+       runIterTests(t, "IterNextC2", NFKC, iterTests, true)
+       runIterTests(t, "IterNextC3", NFKC, iterTestsC, false)
+}
+
+type SegmentTest struct {
+       in  string
+       out []string
+}
+
+var segmentTests = []SegmentTest{
+       {rep('a', MaxSegmentSize), []string{rep('a', MaxSegmentSize), ""}},
+       {rep('a', MaxSegmentSize+2), []string{rep('a', MaxSegmentSize-1), "aaa", ""}},
+       {rep('a', MaxSegmentSize) + "\u0300aa", []string{rep('a', MaxSegmentSize-1), "a\u0300", "aa", ""}},
+}
+
+// Note that, by design, segmentation is equal for composing and decomposing forms.
+func TestIterSegmentation(t *testing.T) {
+       segmentTest(t, "SegmentTestD", NFD, segmentTests)
+       segmentTest(t, "SegmentTestC", NFC, segmentTests)
+}
+
+func segmentTest(t *testing.T, name string, f Form, tests []SegmentTest) {
+       iter := Iter{}
+       for i, tt := range segmentTests {
+               buf := make([]byte, MaxSegmentSize)
+               iter.SetInputString(f, tt.in)
+               for j, seg := range tt.out {
+                       if seg == "" {
+                               if !iter.Done() {
+                                       n := iter.Next(buf)
+                                       res := string(buf[:n])
+                                       t.Errorf(`%s:%d:%d: expected Done()==true, found segment "%s"`, name, i, j, res)
+                               }
+                               continue
+                       }
+                       if iter.Done() {
+                               t.Errorf("%s:%d:%d: Done()==true, want false", name, i, j)
+                       }
+                       n := iter.Next(buf)
+                       seg = f.String(seg)
+                       if res := string(buf[:n]); res != seg {
+                               t.Errorf(`%s:%d:%d" segment was "%s" (%d); want "%s" (%d)`, name, i, j, res, len(res), seg, len(seg))
+                       }
+               }
+       }
+}
diff --git a/src/pkg/exp/norm/normalize.go b/src/pkg/exp/norm/normalize.go

index 030d900918ed6873d1877c50f55654a5d203788d..b5cd44abfa0dc7af46366d1ff5e18bba7382236d 100644 (file)
--- a/src/pkg/exp/norm/normalize.go
+++ b/src/pkg/exp/norm/normalize.go
@@ -243,7 +243,7 @@ func quickSpan(rb *reorderBuffer, i int) int {
         lastSegStart := i
         src, n := rb.src, rb.nsrc
         for i < n {
-               if j := src.skipASCII(i); i != j {
+               if j := src.skipASCII(i, n); i != j {
                         i = j
                         lastSegStart = i - 1
                         lastCC = 0
@@ -448,11 +448,16 @@ func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
                 }
                 // Check that decomposition doesn't result in overflow.
                 if info.hasDecomposition() {
-                       dcomp := info.decomposition()
-                       for i := 0; i < len(dcomp); {
-                               inf := rb.f.info(inputBytes(dcomp), i)
-                               i += int(inf.size)
+                       if isHangul(buf) {
+                               i += int(info.size)
                                 n++
+                       } else {
+                               dcomp := info.decomposition()
+                               for i := 0; i < len(dcomp); {
+                                       inf := rb.f.info(inputBytes(dcomp), i)
+                                       i += int(inf.size)
+                                       n++
+                               }
                         }
                 } else {
                         n++
diff --git a/src/pkg/exp/norm/normalize_test.go b/src/pkg/exp/norm/normalize_test.go

index c7d5e08fca0a6321d7155a8f9fa057c39655d91a..8b970598b4d1faed388067d6bda157eb1ec5afb4 100644 (file)
--- a/src/pkg/exp/norm/normalize_test.go
+++ b/src/pkg/exp/norm/normalize_test.go
@@ -5,6 +5,7 @@
  package norm
  
  import (
+       "bytes"
         "strings"
         "testing"
  )
@@ -495,15 +496,40 @@ func TestAppend(t *testing.T) {
         runAppendTests(t, "TestString", NFKC, stringF, appendTests)
  }
  
+func appendBench(f Form, in []byte) func() {
+       buf := make([]byte, 0, 4*len(in))
+       return func() {
+               f.Append(buf, in...)
+       }
+}
+
+func iterBench(f Form, in []byte) func() {
+       buf := make([]byte, 4*len(in))
+       iter := Iter{}
+       return func() {
+               iter.SetInput(f, in)
+               for !iter.Done() {
+                       iter.Next(buf)
+               }
+       }
+}
+
+func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
+       //bm = append(bm, appendBench(f, in))
+       bm = append(bm, iterBench(f, in))
+       return bm
+}
+
  func doFormBenchmark(b *testing.B, inf, f Form, s string) {
         b.StopTimer()
         in := inf.Bytes([]byte(s))
-       buf := make([]byte, 2*len(in))
-       b.SetBytes(int64(len(in)))
+       bm := appendBenchmarks(nil, f, in)
+       b.SetBytes(int64(len(in) * len(bm)))
         b.StartTimer()
         for i := 0; i < b.N; i++ {
-               buf = f.Append(buf[0:0], in...)
-               buf = buf[0:0]
+               for _, fn := range bm {
+                       fn()
+               }
         }
  }
  
@@ -549,17 +575,21 @@ func BenchmarkNormalizeHangulNFD2NFD(b *testing.B) {
         doFormBenchmark(b, NFD, NFD, txt_kr)
  }
  
+var forms = []Form{NFC, NFD, NFKC, NFKD}
+
  func doTextBenchmark(b *testing.B, s string) {
         b.StopTimer()
-       b.SetBytes(int64(len(s)) * 4)
         in := []byte(s)
-       var buf = make([]byte, 0, 2*len(in))
+       bm := []func(){}
+       for _, f := range forms {
+               bm = appendBenchmarks(bm, f, in)
+       }
+       b.SetBytes(int64(len(s) * len(bm)))
         b.StartTimer()
         for i := 0; i < b.N; i++ {
-               NFC.Append(buf, in...)
-               NFD.Append(buf, in...)
-               NFKC.Append(buf, in...)
-               NFKD.Append(buf, in...)
+               for _, f := range bm {
+                       f()
+               }
         }
  }
  
@@ -584,6 +614,11 @@ func BenchmarkJapanese(b *testing.B) {
  func BenchmarkChinese(b *testing.B) {
         doTextBenchmark(b, txt_cn)
  }
+func BenchmarkOverflow(b *testing.B) {
+       doTextBenchmark(b, overflow)
+}
+
+var overflow = string(bytes.Repeat([]byte("\u035D"), 4096)) + "\u035B"
  
  // Tests sampled from the Canonical ordering tests (Part 2) of
  // http://unicode.org/Public/UNIDATA/NormalizationTest.txt
diff --git a/src/pkg/exp/norm/normregtest.go b/src/pkg/exp/norm/normregtest.go

index c2ab25bc99dab9a7836b8ea2a040dafc7e637afc..507de1ae8340bed21f5aeb91285a569bb2189276 100644 (file)
--- a/src/pkg/exp/norm/normregtest.go
+++ b/src/pkg/exp/norm/normregtest.go
@@ -220,6 +220,17 @@ func cmpIsNormal(t *Test, name string, f norm.Form, test string, result, want bo
  func doTest(t *Test, f norm.Form, gold, test string) {
         result := f.Bytes([]byte(test))
         cmpResult(t, "Bytes", f, gold, test, string(result))
+       sresult := f.String(test)
+       cmpResult(t, "String", f, gold, test, sresult)
+       buf := make([]byte, norm.MaxSegmentSize)
+       acc := []byte{}
+       i := norm.Iter{}
+       i.SetInputString(f, test)
+       for !i.Done() {
+               n := i.Next(buf)
+               acc = append(acc, buf[:n]...)
+       }
+       cmpResult(t, "Iter.Next", f, gold, test, string(acc))
         for i := range test {
                 out := f.Append(f.Bytes([]byte(test[:i])), []byte(test[i:])...)
                 cmpResult(t, fmt.Sprintf(":Append:%d", i), f, gold, test, string(out))
author	Marcel van Lohuizen <mpvl@golang.org>
	Tue, 21 Feb 2012 12:13:21 +0000 (13:13 +0100)
committer	Marcel van Lohuizen <mpvl@golang.org>
	Tue, 21 Feb 2012 12:13:21 +0000 (13:13 +0100)
src/pkg/exp/norm/composition.go		patch \| blob \| history
src/pkg/exp/norm/composition_test.go		patch \| blob \| history
src/pkg/exp/norm/input.go		patch \| blob \| history
src/pkg/exp/norm/iter.go	[new file with mode: 0644]	patch \| blob
src/pkg/exp/norm/iter_test.go	[new file with mode: 0644]	patch \| blob
src/pkg/exp/norm/normalize.go		patch \| blob \| history
src/pkg/exp/norm/normalize_test.go		patch \| blob \| history
src/pkg/exp/norm/normregtest.go		patch \| blob \| history