]> Cypherpunks repositories - gostls13.git/commitdiff
utf8: add DecodeLastRune and DecodeLastRuneInString to
authorRoger Peppe <rogpeppe@gmail.com>
Thu, 23 Sep 2010 10:33:52 +0000 (20:33 +1000)
committerRob Pike <r@golang.org>
Thu, 23 Sep 2010 10:33:52 +0000 (20:33 +1000)
enable traversing rune-by-rune backwards in strings

R=r, rsc
CC=golang-dev
https://golang.org/cl/2192050

src/pkg/utf8/utf8.go
src/pkg/utf8/utf8_test.go

index 8e373e32d1fe7d4d00d2aa01b7d0ec1b7fd5500a..dfcdef9613b0196fca7692703100dd9cf344854b 100644 (file)
@@ -209,6 +209,73 @@ func DecodeRuneInString(s string) (rune, size int) {
        return
 }
 
+// DecodeLastRune unpacks the last UTF-8 encoding in p
+// and returns the rune and its width in bytes.
+func DecodeLastRune(p []byte) (rune, size int) {
+       end := len(p)
+       if end == 0 {
+               return RuneError, 0
+       }
+       start := end - 1
+       rune = int(p[start])
+       if rune < RuneSelf {
+               return rune, 1
+       }
+       // guard against O(n^2) behavior when traversing
+       // backwards through strings with long sequences of
+       // invalid UTF-8.
+       lim := end - UTFMax
+       if lim < 0 {
+               lim = 0
+       }
+       for start--; start >= lim; start-- {
+               if RuneStart(p[start]) {
+                       break
+               }
+       }
+       if start < 0 {
+               start = 0
+       }
+       rune, size = DecodeRune(p[start:end])
+       if start+size != end {
+               return RuneError, 1
+       }
+       return rune, size
+}
+
+// DecodeLastRuneInString is like DecodeLastRune but its input is a string.
+func DecodeLastRuneInString(s string) (rune, size int) {
+       end := len(s)
+       if end == 0 {
+               return RuneError, 0
+       }
+       start := end - 1
+       rune = int(s[start])
+       if rune < RuneSelf {
+               return rune, 1
+       }
+       // guard against O(n^2) behavior when traversing
+       // backwards through strings with long sequences of
+       // invalid UTF-8.
+       lim := end - UTFMax
+       if lim < 0 {
+               lim = 0
+       }
+       for start--; start >= lim; start-- {
+               if RuneStart(s[start]) {
+                       break
+               }
+       }
+       if start < 0 {
+               start = 0
+       }
+       rune, size = DecodeRuneInString(s[start:end])
+       if start+size != end {
+               return RuneError, 1
+       }
+       return rune, size
+}
+
 // RuneLen returns the number of bytes required to encode the rune.
 func RuneLen(rune int) int {
        switch {
index 2466cf554be71df05252d03a0b154c4c91aef578..45c5ad3f8fae678d8d049f67a7c9c824e7c0b0b0 100644 (file)
@@ -44,6 +44,12 @@ var utf8map = []Utf8Map{
        Utf8Map{0xFFFD, "\xef\xbf\xbd"},
 }
 
+var testStrings = []string{
+       "",
+       "abcd",
+       "\x80\x80\x80\x80",
+}
+
 // strings.Bytes with one extra byte at end
 func makeBytes(s string) []byte {
        s += "\x00"
@@ -141,6 +147,79 @@ func TestDecodeRune(t *testing.T) {
                if rune != RuneError || size != 1 {
                        t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, rune, size, RuneError, 1)
                }
+
+       }
+}
+
+// Check that DecodeRune and DecodeLastRune correspond to
+// the equivalent range loop.
+func TestSequencing(t *testing.T) {
+       for _, ts := range testStrings {
+               for _, m := range utf8map {
+                       for _, s := range []string{ts + m.str, m.str + ts, ts + m.str + ts} {
+                               testSequence(t, s)
+                       }
+               }
+       }
+}
+
+func testSequence(t *testing.T, s string) {
+       type info struct {
+               index int
+               rune  int
+       }
+       index := make([]info, len(s))
+       b := []byte(s)
+       si := 0
+       j := 0
+       for i, r := range s {
+               if si != i {
+                       t.Errorf("Sequence(%q) mismatched index %d, want %d", s, si, i)
+                       return
+               }
+               index[j] = info{i, r}
+               j++
+               rune1, size1 := DecodeRune(b[i:])
+               if r != rune1 {
+                       t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], rune1, r)
+                       return
+               }
+               rune2, size2 := DecodeRuneInString(s[i:])
+               if r != rune2 {
+                       t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s[i:], rune2, r)
+                       return
+               }
+               if size1 != size2 {
+                       t.Errorf("DecodeRune/DecodeRuneInString(%q) size mismatch %d/%d", s[i:], size1, size2)
+                       return
+               }
+               si += size1
+       }
+       j--
+       for si = len(s); si > 0; {
+               rune1, size1 := DecodeLastRune(b[0:si])
+               rune2, size2 := DecodeLastRuneInString(s[0:si])
+               if size1 != size2 {
+                       t.Errorf("DecodeLastRune/DecodeLastRuneInString(%q, %d) size mismatch %d/%d", s, si, size1, size2)
+                       return
+               }
+               if rune1 != index[j].rune {
+                       t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, si, rune1, index[j].rune)
+                       return
+               }
+               if rune2 != index[j].rune {
+                       t.Errorf("DecodeLastRuneInString(%q, %d) = %#04x, want %#04x", s, si, rune2, index[j].rune)
+                       return
+               }
+               si -= size1
+               if si != index[j].index {
+                       t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, si, index[j].index)
+                       return
+               }
+               j--
+       }
+       if si != 0 {
+               t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, si)
        }
 }