]> Cypherpunks repositories - gostls13.git/commitdiff
strings: correctly handle invalid utf8 sequences in Map
authorMartin Möhrmann <moehrmann@google.com>
Sun, 26 Aug 2018 12:22:39 +0000 (14:22 +0200)
committerBrad Fitzpatrick <bradfitz@golang.org>
Wed, 3 Oct 2018 21:27:10 +0000 (21:27 +0000)
When an invalid UTF-8 byte sequence is decoded in a range loop over a string
a utf8.RuneError rune is returned. This is not distinguishable from decoding
the valid '\uFFFD' sequence representing utf8.RuneError from a string without
further checks within the range loop.

The previous Map code did not do any extra checks and would thereby not map
invalid UTF-8 byte sequences correctly when those were mapping to utf8.RuneError.

Fix this by adding the extra checks necessary to distinguish the decoding
of invalid utf8 byte sequences from decoding the sequence for utf8.RuneError
when the mapping of a rune is utf8.RuneError.

This fix does not result in a measureable performance regression:
name                old time/op  new time/op  delta
ByteByteMap         1.05µs ± 3%  1.03µs ± 3%   ~     (p=0.118 n=10+10)
Map/identity/ASCII   169ns ± 2%   170ns ± 1%   ~     (p=0.501 n=9+10)
Map/identity/Greek   298ns ± 1%   303ns ± 4%   ~     (p=0.338 n=10+10)
Map/change/ASCII     323ns ± 3%   325ns ± 4%   ~     (p=0.679 n=8+10)
Map/change/Greek     628ns ± 5%   635ns ± 1%   ~     (p=0.460 n=10+9)
MapNoChanges         120ns ± 4%   119ns ± 1%   ~     (p=0.496 n=10+9)

Fixes #26305

Change-Id: I70e99fa244983c5040756fa4549ac1e8cb6022c3
Reviewed-on: https://go-review.googlesource.com/c/131495
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
src/strings/strings.go
src/strings/strings_test.go

index 00200e4e24df07ea23b936b6561b3a94e0737831..ecc8c97d9eb46414ea5a52c09b75703006c6fbb3 100644 (file)
@@ -463,27 +463,27 @@ func Map(mapping func(rune) rune, s string) string {
 
        for i, c := range s {
                r := mapping(c)
-               if r == c {
+               if r == c && c != utf8.RuneError {
                        continue
                }
 
+               var width int
+               if c == utf8.RuneError {
+                       c, width = utf8.DecodeRuneInString(s[i:])
+                       if width != 1 && r == c {
+                               continue
+                       }
+               } else {
+                       width = utf8.RuneLen(c)
+               }
+
                b.Grow(len(s) + utf8.UTFMax)
                b.WriteString(s[:i])
                if r >= 0 {
                        b.WriteRune(r)
                }
 
-               if c == utf8.RuneError {
-                       // RuneError is the result of either decoding
-                       // an invalid sequence or '\uFFFD'. Determine
-                       // the correct number of bytes we need to advance.
-                       _, w := utf8.DecodeRuneInString(s[i:])
-                       i += w
-               } else {
-                       i += utf8.RuneLen(c)
-               }
-
-               s = s[i:]
+               s = s[i+width:]
                break
        }
 
index bb6a5b931b85d12f1fb2da136c38c0e8618406cc..eee2dd55dfd09eb886b8672cc3cc0ee6b1318462 100644 (file)
@@ -646,10 +646,10 @@ func TestMap(t *testing.T) {
                if unicode.Is(unicode.Latin, r) {
                        return r
                }
-               return '?'
+               return utf8.RuneError
        }
        m = Map(replaceNotLatin, "Hello\255World")
-       expect = "Hello?World"
+       expect = "Hello\uFFFDWorld"
        if m != expect {
                t.Errorf("replace invalid sequence: expected %q got %q", expect, m)
        }