From: Roger Peppe Date: Thu, 23 Sep 2010 10:33:52 +0000 (+1000) Subject: utf8: add DecodeLastRune and DecodeLastRuneInString to X-Git-Tag: weekly.2010-09-29~80 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=f11271b82e123239e9263749b18b3ea9ad5c0610;p=gostls13.git utf8: add DecodeLastRune and DecodeLastRuneInString to enable traversing rune-by-rune backwards in strings R=r, rsc CC=golang-dev https://golang.org/cl/2192050 --- diff --git a/src/pkg/utf8/utf8.go b/src/pkg/utf8/utf8.go index 8e373e32d1..dfcdef9613 100644 --- a/src/pkg/utf8/utf8.go +++ b/src/pkg/utf8/utf8.go @@ -209,6 +209,73 @@ func DecodeRuneInString(s string) (rune, size int) { return } +// DecodeLastRune unpacks the last UTF-8 encoding in p +// and returns the rune and its width in bytes. +func DecodeLastRune(p []byte) (rune, size int) { + end := len(p) + if end == 0 { + return RuneError, 0 + } + start := end - 1 + rune = int(p[start]) + if rune < RuneSelf { + return rune, 1 + } + // guard against O(n^2) behavior when traversing + // backwards through strings with long sequences of + // invalid UTF-8. + lim := end - UTFMax + if lim < 0 { + lim = 0 + } + for start--; start >= lim; start-- { + if RuneStart(p[start]) { + break + } + } + if start < 0 { + start = 0 + } + rune, size = DecodeRune(p[start:end]) + if start+size != end { + return RuneError, 1 + } + return rune, size +} + +// DecodeLastRuneInString is like DecodeLastRune but its input is a string. +func DecodeLastRuneInString(s string) (rune, size int) { + end := len(s) + if end == 0 { + return RuneError, 0 + } + start := end - 1 + rune = int(s[start]) + if rune < RuneSelf { + return rune, 1 + } + // guard against O(n^2) behavior when traversing + // backwards through strings with long sequences of + // invalid UTF-8. + lim := end - UTFMax + if lim < 0 { + lim = 0 + } + for start--; start >= lim; start-- { + if RuneStart(s[start]) { + break + } + } + if start < 0 { + start = 0 + } + rune, size = DecodeRuneInString(s[start:end]) + if start+size != end { + return RuneError, 1 + } + return rune, size +} + // RuneLen returns the number of bytes required to encode the rune. func RuneLen(rune int) int { switch { diff --git a/src/pkg/utf8/utf8_test.go b/src/pkg/utf8/utf8_test.go index 2466cf554b..45c5ad3f8f 100644 --- a/src/pkg/utf8/utf8_test.go +++ b/src/pkg/utf8/utf8_test.go @@ -44,6 +44,12 @@ var utf8map = []Utf8Map{ Utf8Map{0xFFFD, "\xef\xbf\xbd"}, } +var testStrings = []string{ + "", + "abcd", + "\x80\x80\x80\x80", +} + // strings.Bytes with one extra byte at end func makeBytes(s string) []byte { s += "\x00" @@ -141,6 +147,79 @@ func TestDecodeRune(t *testing.T) { if rune != RuneError || size != 1 { t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, rune, size, RuneError, 1) } + + } +} + +// Check that DecodeRune and DecodeLastRune correspond to +// the equivalent range loop. +func TestSequencing(t *testing.T) { + for _, ts := range testStrings { + for _, m := range utf8map { + for _, s := range []string{ts + m.str, m.str + ts, ts + m.str + ts} { + testSequence(t, s) + } + } + } +} + +func testSequence(t *testing.T, s string) { + type info struct { + index int + rune int + } + index := make([]info, len(s)) + b := []byte(s) + si := 0 + j := 0 + for i, r := range s { + if si != i { + t.Errorf("Sequence(%q) mismatched index %d, want %d", s, si, i) + return + } + index[j] = info{i, r} + j++ + rune1, size1 := DecodeRune(b[i:]) + if r != rune1 { + t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], rune1, r) + return + } + rune2, size2 := DecodeRuneInString(s[i:]) + if r != rune2 { + t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s[i:], rune2, r) + return + } + if size1 != size2 { + t.Errorf("DecodeRune/DecodeRuneInString(%q) size mismatch %d/%d", s[i:], size1, size2) + return + } + si += size1 + } + j-- + for si = len(s); si > 0; { + rune1, size1 := DecodeLastRune(b[0:si]) + rune2, size2 := DecodeLastRuneInString(s[0:si]) + if size1 != size2 { + t.Errorf("DecodeLastRune/DecodeLastRuneInString(%q, %d) size mismatch %d/%d", s, si, size1, size2) + return + } + if rune1 != index[j].rune { + t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, si, rune1, index[j].rune) + return + } + if rune2 != index[j].rune { + t.Errorf("DecodeLastRuneInString(%q, %d) = %#04x, want %#04x", s, si, rune2, index[j].rune) + return + } + si -= size1 + if si != index[j].index { + t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, si, index[j].index) + return + } + j-- + } + if si != 0 { + t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, si) } }