From 0d1cbaf22524113eb49347c1194084c572e5a003 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Thu, 4 Dec 2008 21:00:34 -0800 Subject: [PATCH] strings.utflen -> utf8.RuneCount, RuneCountInString R=r DELTA=94 (52 added, 33 deleted, 9 changed) OCL=20547 CL=20552 --- src/lib/Makefile | 1 + src/lib/strings.go | 29 ++++++++--------------------- src/lib/strings_test.go | 18 ------------------ src/lib/utf8.go | 35 +++++++++++++++++++++++++++++++---- src/lib/utf8_test.go | 22 ++++++++++++++++++++++ 5 files changed, 62 insertions(+), 43 deletions(-) diff --git a/src/lib/Makefile b/src/lib/Makefile index 7079433897..b920aa6198 100644 --- a/src/lib/Makefile +++ b/src/lib/Makefile @@ -88,6 +88,7 @@ bignum.6: fmt.dirinstall bufio.6: io.dirinstall os.dirinstall flag.6: fmt.dirinstall testing.6: flag.install fmt.dirinstall +strings.6: utf8.install fmt.dirinstall: io.dirinstall reflect.dirinstall strconv.dirinstall hash.dirinstall: os.dirinstall diff --git a/src/lib/strings.go b/src/lib/strings.go index 433e500640..c171214db2 100644 --- a/src/lib/strings.go +++ b/src/lib/strings.go @@ -4,30 +4,17 @@ package strings -// Count UTF-8 sequences in s. -// Assumes s is well-formed. -export func utflen(s string) int { - n := 0; - for i := 0; i < len(s); i++ { - if s[i]&0xC0 != 0x80 { - n++ - } - } - return n -} +import "utf8" // Split string into array of UTF-8 sequences (still strings) export func explode(s string) *[]string { - a := new([]string, utflen(s)); + a := new([]string, utf8.RuneCountInString(s, 0, len(s))); j := 0; + var size, rune int; for i := 0; i < len(a); i++ { - ej := j; - ej++; - for ej < len(s) && (s[ej]&0xC0) == 0x80 { - ej++ - } - a[i] = s[j:ej]; - j = ej + rune, size = utf8.DecodeRuneInString(s, j); + a[i] = string(rune); + j += size; } return a } @@ -35,7 +22,7 @@ export func explode(s string) *[]string { // Count non-overlapping instances of sep in s. export func count(s, sep string) int { if sep == "" { - return utflen(s)+1 + return utf8.RuneCountInString(s, 0, len(s))+1 } c := sep[0]; n := 0; @@ -83,7 +70,7 @@ export func split(s, sep string) *[]string { a[na] = s[start:len(s)]; return a } - + // Join list of strings with separators between them. export func join(a *[]string, sep string) string { if len(a) == 0 { diff --git a/src/lib/strings_test.go b/src/lib/strings_test.go index a7b63738b5..50ad30cc36 100644 --- a/src/lib/strings_test.go +++ b/src/lib/strings_test.go @@ -79,21 +79,3 @@ export func TestSplit(t *testing.T) { } } -// TODO: utflen shouldn't even be in strings. -type UtflenTest struct { - in string; - out int; -} -var utflentests = []UtflenTest { - UtflenTest{ abcd, 4 }, - UtflenTest{ faces, 3 }, - UtflenTest{ commas, 7 }, -} -export func TestUtflen(t *testing.T) { - for i := 0; i < len(utflentests); i++ { - tt := utflentests[i]; - if out := strings.utflen(tt.in); out != tt.out { - t.Errorf("utflen(%q) = %d, want %d", tt.in, out, tt.out); - } - } -} diff --git a/src/lib/utf8.go b/src/lib/utf8.go index 9ece25f6a5..7c1c8fbe5a 100644 --- a/src/lib/utf8.go +++ b/src/lib/utf8.go @@ -107,8 +107,7 @@ func DecodeRuneInternal(p *[]byte) (rune, size int, short bool) { return RuneError, 1, false } -func DecodeRuneInStringInternal(s string, i int) (rune, size int, short bool) { - n := len(s) - i; +func DecodeRuneInStringInternal(s string, i int, n int) (rune, size int, short bool) { if n < 1 { return RuneError, 0, true; } @@ -188,7 +187,7 @@ export func FullRune(p *[]byte) bool { } export func FullRuneInString(s string, i int) bool { - rune, size, short := DecodeRuneInStringInternal(s, i); + rune, size, short := DecodeRuneInStringInternal(s, i, len(s) - i); return !short } @@ -200,7 +199,7 @@ export func DecodeRune(p *[]byte) (rune, size int) { export func DecodeRuneInString(s string, i int) (rune, size int) { var short bool; - rune, size, short = DecodeRuneInStringInternal(s, i); + rune, size, short = DecodeRuneInStringInternal(s, i, len(s) - i); return; } @@ -248,3 +247,31 @@ export func EncodeRune(rune int, p *[]byte) int { return 4; } +export func RuneCount(p *[]byte) int { + i := 0; + var n int; + for n = 0; i < len(p); n++ { + if p[i] < RuneSelf { + i++; + } else { + rune, size := DecodeRune(p[i:len(p)]); + i += size; + } + } + return n; +} + +export func RuneCountInString(s string, i int, l int) int { + ei := i + l; + n := 0; + for n = 0; i < ei; n++ { + if s[i] < RuneSelf { + i++; + } else { + rune, size, short := DecodeRuneInStringInternal(s, i, ei - i); + i += size; + } + } + return n; +} + diff --git a/src/lib/utf8_test.go b/src/lib/utf8_test.go index 18c06c2ce5..31118dd30c 100644 --- a/src/lib/utf8_test.go +++ b/src/lib/utf8_test.go @@ -156,3 +156,25 @@ export func TestDecodeRune(t *testing.T) { } } } + +type RuneCountTest struct { + in string; + out int; +} +var runecounttests = []RuneCountTest { + RuneCountTest{ "abcd", 4 }, + RuneCountTest{ "☺☻☹", 3 }, + RuneCountTest{ "1,2,3,4", 7 }, + RuneCountTest{ "\xe2\x00", 2 }, +} +export func TestRuneCount(t *testing.T) { + for i := 0; i < len(runecounttests); i++ { + tt := runecounttests[i]; + if out := utf8.RuneCountInString(tt.in, 0, len(tt.in)); out != tt.out { + t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out); + } + if out := utf8.RuneCount(Bytes(tt.in)); out != tt.out { + t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out); + } + } +} -- 2.48.1