utf8: Add new type String to automate string indexing by code point.

author Rob Pike <r@golang.org>

Fri, 24 Sep 2010 20:58:34 +0000 (06:58 +1000)

committer Rob Pike <r@golang.org>

Fri, 24 Sep 2010 20:58:34 +0000 (06:58 +1000)
author Rob Pike <r@golang.org>
Fri, 24 Sep 2010 20:58:34 +0000 (06:58 +1000)
committer Rob Pike <r@golang.org>
Fri, 24 Sep 2010 20:58:34 +0000 (06:58 +1000)
diff --git a/src/pkg/utf8/Makefile b/src/pkg/utf8/Makefile

index df69486dc3f42c744b7f9f43da32d5bd04a9fb77..b3574ba3b410c45143e69374309c9e2433c20fa4 100644 (file)
--- a/src/pkg/utf8/Makefile
+++ b/src/pkg/utf8/Makefile
@@ -6,6 +6,7 @@ include ../../Make.inc
  
  TARG=utf8
  GOFILES=\
+       string.go\
         utf8.go\
  
  include ../../Make.pkg
diff --git a/src/pkg/utf8/string.go b/src/pkg/utf8/string.go

new file mode 100644 (file)

index 0000000..ce74f72
--- /dev/null
+++ b/src/pkg/utf8/string.go
@@ -0,0 +1,166 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package utf8
+
+// String wraps a regular string with a small structure that provides more
+// efficient indexing by code point index, as opposed to byte index.
+// Scanning incrementally forwards or backwards is O(1) per index operation
+// (although not as fast a range clause going forwards).  Random access is
+// O(N) in the length of the string, but the overhead is less than always
+// scanning from the beginning.
+// If the string is ASCII, random access is O(1).
+type String struct {
+       str      string
+       numRunes int
+       // If width > 0, the rune at runePos starts at bytePos and has the specified width.
+       width    int
+       bytePos  int
+       runePos  int
+       nonASCII int // byte index of the first non-ASCII rune.
+}
+
+// NewString returns a new UTF-8 string with the provided contents.
+func NewString(contents string) *String {
+       for i := 0; i < len(contents); i++ {
+               if contents[i] >= RuneSelf {
+                       // Not ASCII.
+                       _, wid := DecodeRuneInString(contents)
+                       return &String{
+                               str:      contents,
+                               numRunes: RuneCountInString(contents),
+                               width:    wid,
+                               nonASCII: i,
+                       }
+               }
+       }
+       // ASCII is simple.  Also, the empty string is ASCII.
+       return &String{str: contents, numRunes: len(contents), nonASCII: len(contents)}
+}
+
+// String returns the contents of the String.  This method also means the
+// String is directly printable by fmt.Print.
+func (s *String) String() string {
+       return s.str
+}
+
+// RuneCount returns the number of runes (Unicode code points) in the String.
+func (s *String) RuneCount() int {
+       return s.numRunes
+}
+
+// IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
+func (s *String) IsASCII() bool {
+       return s.width == 0
+}
+
+// At returns the rune with index i in the String.  The sequence of runes is the same
+// as iterating over the contents with a "for range" clause.
+func (s *String) At(i int) int {
+       // ASCII is easy.  Let the compiler catch the indexing error if there is one.
+       if i < s.nonASCII {
+               return int(s.str[i])
+       }
+
+       // Now we do need to know the index is valid.
+       if i < 0 || i >= s.numRunes {
+               panic(outOfRange)
+       }
+
+       var rune int
+
+       // Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
+       // With these cases, all scans from beginning or end work in O(1) time per rune.
+       switch {
+
+       case i == s.runePos-1: // backing up one rune
+               rune, s.width = DecodeLastRuneInString(s.str[0:s.bytePos])
+               s.runePos = i
+               s.bytePos -= s.width
+               return rune
+       case i == s.runePos+1: // moving ahead one rune
+               s.runePos = i
+               s.bytePos += s.width
+               fallthrough
+       case i == s.runePos:
+               rune, s.width = DecodeRuneInString(s.str[s.bytePos:])
+               return rune
+       case i == 0: // start of string
+               rune, s.width = DecodeRuneInString(s.str)
+               s.runePos = 0
+               s.bytePos = 0
+               return rune
+
+       case i == s.numRunes-1: // last rune in string
+               rune, s.width = DecodeLastRuneInString(s.str)
+               s.runePos = i
+               s.bytePos = len(s.str) - s.width
+               return rune
+       }
+
+       // We need to do a linear scan.  There are three places to start from:
+       // 1) The beginning
+       // 2) bytePos/runePos.
+       // 3) The end
+       // Choose the closest in rune count, scanning backwards if necessary.
+       forward := true
+       if i < s.runePos {
+               // Between beginning and pos.  Which is closer?
+               // Since both i and runePos are guaranteed >= nonASCII, that's the
+               // lowest location we need to start from.
+               if i < (s.runePos-s.nonASCII)/2 {
+                       // Scan forward from beginning
+                       s.bytePos, s.runePos = s.nonASCII, s.nonASCII
+               } else {
+                       // Scan backwards from where we are
+                       forward = false
+               }
+       } else {
+               // Between pos and end.  Which is closer?
+               if i-s.runePos < (s.numRunes-s.runePos)/2 {
+                       // Scan forward from pos
+               } else {
+                       // Scan backwards from end
+                       s.bytePos, s.runePos = len(s.str), s.numRunes
+                       forward = false
+               }
+       }
+       if forward {
+               // TODO: Is it much faster to use a range loop for this scan?
+               for {
+                       rune, s.width = DecodeRuneInString(s.str[s.bytePos:])
+                       if s.runePos == i {
+                               break
+                       }
+                       s.runePos++
+                       s.bytePos += s.width
+               }
+       } else {
+               for {
+                       rune, s.width = DecodeLastRuneInString(s.str[0:s.bytePos])
+                       s.runePos--
+                       s.bytePos -= s.width
+                       if s.runePos == i {
+                               break
+                       }
+               }
+       }
+       return rune
+}
+
+// We want the panic in At(i) to satisfy os.Error, because that's what
+// runtime panics satisfy, but we can't import os.  This is our solution.
+
+// error is the type of the error returned if a user calls String.At(i) with i out of range.
+// It satisfies os.Error and runtime.Error.
+type error string
+
+func (err error) String() string {
+       return string(err)
+}
+
+func (err error) RunTimeError() {
+}
+
+var outOfRange = error("utf8.String: index out of Range")
diff --git a/src/pkg/utf8/string_test.go b/src/pkg/utf8/string_test.go

new file mode 100644 (file)

index 0000000..7ce8f8a
--- /dev/null
+++ b/src/pkg/utf8/string_test.go
@@ -0,0 +1,70 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package utf8_test
+
+import (
+       "rand"
+       "testing"
+       . "utf8"
+)
+
+func TestScanForwards(t *testing.T) {
+       for _, s := range testStrings {
+               runes := []int(s)
+               str := NewString(s)
+               if str.RuneCount() != len(runes) {
+                       t.Error("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
+                       break
+               }
+               for i, expect := range runes {
+                       got := str.At(i)
+                       if got != expect {
+                               t.Errorf("%s[%d]: expected %c (U+%04x); got %c (U+%04x)", s, i, expect, expect, got, got)
+                       }
+               }
+       }
+}
+
+func TestScanBackwards(t *testing.T) {
+       for _, s := range testStrings {
+               runes := []int(s)
+               str := NewString(s)
+               if str.RuneCount() != len(runes) {
+                       t.Error("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
+                       break
+               }
+               for i := len(runes) - 1; i >= 0; i-- {
+                       expect := runes[i]
+                       got := str.At(i)
+                       if got != expect {
+                               t.Errorf("%s[%d]: expected %c (U+%04x); got %c (U+%04x)", s, i, expect, expect, got, got)
+                       }
+               }
+       }
+}
+
+const randCount = 100000
+
+func TestRandomAccess(t *testing.T) {
+       for _, s := range testStrings {
+               if len(s) == 0 {
+                       continue
+               }
+               runes := []int(s)
+               str := NewString(s)
+               if str.RuneCount() != len(runes) {
+                       t.Error("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
+                       break
+               }
+               for j := 0; j < randCount; j++ {
+                       i := rand.Intn(len(runes))
+                       expect := runes[i]
+                       got := str.At(i)
+                       if got != expect {
+                               t.Errorf("%s[%d]: expected %c (U+%04x); got %c (U+%04x)", s, i, expect, expect, got, got)
+                       }
+               }
+       }
+}
diff --git a/src/pkg/utf8/utf8_test.go b/src/pkg/utf8/utf8_test.go

index 45c5ad3f8fae678d8d049f67a7c9c824e7c0b0b0..92e6bd3aabb1601ab8c6cfb39060a1f0ed0c54f1 100644 (file)
--- a/src/pkg/utf8/utf8_test.go
+++ b/src/pkg/utf8/utf8_test.go
@@ -47,20 +47,16 @@ var utf8map = []Utf8Map{
  var testStrings = []string{
         "",
         "abcd",
+       "☺☻☹",
+       "日a本b語ç日ð本Ê語þ日¥本¼語i日©",
+       "日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©",
         "\x80\x80\x80\x80",
  }
  
-// strings.Bytes with one extra byte at end
-func makeBytes(s string) []byte {
-       s += "\x00"
-       b := []byte(s)
-       return b[0 : len(s)-1]
-}
-
  func TestFullRune(t *testing.T) {
         for i := 0; i < len(utf8map); i++ {
                 m := utf8map[i]
-               b := makeBytes(m.str)
+               b := []byte(m.str)
                 if !FullRune(b) {
                         t.Errorf("FullRune(%q) (rune %04x) = false, want true", b, m.rune)
                 }
@@ -82,7 +78,7 @@ func TestFullRune(t *testing.T) {
  func TestEncodeRune(t *testing.T) {
         for i := 0; i < len(utf8map); i++ {
                 m := utf8map[i]
-               b := makeBytes(m.str)
+               b := []byte(m.str)
                 var buf [10]byte
                 n := EncodeRune(m.rune, buf[0:])
                 b1 := buf[0:n]
@@ -95,7 +91,7 @@ func TestEncodeRune(t *testing.T) {
  func TestDecodeRune(t *testing.T) {
         for i := 0; i < len(utf8map); i++ {
                 m := utf8map[i]
-               b := makeBytes(m.str)
+               b := []byte(m.str)
                 rune, size := DecodeRune(b)
                 if rune != m.rune || size != len(b) {
                         t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, rune, size, m.rune, len(b))
@@ -163,6 +159,26 @@ func TestSequencing(t *testing.T) {
         }
  }
  
+// Check that a range loop and a []int conversion visit the same runes.
+// Not really a test of this package, but the assumption is used here and
+// it's good to verify
+func TestIntConversion(t *testing.T) {
+       for _, ts := range testStrings {
+               runes := []int(ts)
+               if RuneCountInString(ts) != len(runes) {
+                       t.Error("%q: expected %d runes; got %d", ts, len(runes), RuneCountInString(ts))
+                       break
+               }
+               i := 0
+               for _, r := range ts {
+                       if r != runes[i] {
+                               t.Errorf("%q[%d]: expected %c (U+%04x); got %c (U+%04x)", ts, i, runes[i], runes[i], r, r)
+                       }
+                       i++
+               }
+       }
+}
+
  func testSequence(t *testing.T, s string) {
         type info struct {
                 index int
@@ -252,7 +268,7 @@ func TestRuneCount(t *testing.T) {
                 if out := RuneCountInString(tt.in); out != tt.out {
                         t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out)
                 }
-               if out := RuneCount(makeBytes(tt.in)); out != tt.out {
+               if out := RuneCount([]byte(tt.in)); out != tt.out {
                         t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out)
                 }
         }
author	Rob Pike <r@golang.org>
	Fri, 24 Sep 2010 20:58:34 +0000 (06:58 +1000)
committer	Rob Pike <r@golang.org>
	Fri, 24 Sep 2010 20:58:34 +0000 (06:58 +1000)
src/pkg/utf8/Makefile		patch \| blob \| history
src/pkg/utf8/string.go	[new file with mode: 0644]	patch \| blob
src/pkg/utf8/string_test.go	[new file with mode: 0644]	patch \| blob
src/pkg/utf8/utf8_test.go		patch \| blob \| history