From: Joe Tsai Date: Mon, 23 Oct 2017 20:47:15 +0000 (-0700) Subject: archive/zip: restrict UTF-8 detection for comment and name fields X-Git-Tag: go1.10beta1~587 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=78805c07f4119ba0fc7bf2e462451d519f42f679;p=gostls13.git archive/zip: restrict UTF-8 detection for comment and name fields CL 39570 added support for automatically setting flag bit 11 to indicate that the filename and comment fields are encoded in UTF-8, which is (conventionally) the encoding using for most Go strings. However, the detection added is too lose for two reasons: * We need to ensure both fields are at least possibly UTF-8. That is, if any field is definitely not UTF-8, then we can't set the bit. * The utf8.ValidRune returns true for utf8.RuneError, which iterating over a Go string automatically returns for invalid UTF-8. Thus, we manually check for that value. Updates #22367 Updates #10741 Change-Id: Ie8aae388432e546e44c6bebd06a00434373ca99e Reviewed-on: https://go-review.googlesource.com/72791 Reviewed-by: Ian Lance Taylor Run-TryBot: Joe Tsai TryBot-Result: Gobot Gobot --- diff --git a/src/archive/zip/writer.go b/src/archive/zip/writer.go index 1aca8518ca..53fc19c590 100644 --- a/src/archive/zip/writer.go +++ b/src/archive/zip/writer.go @@ -215,18 +215,20 @@ func (w *Writer) Create(name string) (io.Writer, error) { return w.CreateHeader(header) } -func hasValidUTF8(s string) bool { - n := 0 +// detectUTF8 reports whether s is a valid UTF-8 string, and whether the string +// must be considered UTF-8 encoding (i.e., not compatible with CP-437). +func detectUTF8(s string) (valid, require bool) { for _, r := range s { - // By default, ZIP uses CP437, which is only identical to ASCII for the printable characters. + // By default, ZIP uses CP-437, + // which is only identical to ASCII for the printable characters. if r < 0x20 || r >= 0x7f { - if !utf8.ValidRune(r) { - return false + if !utf8.ValidRune(r) || r == utf8.RuneError { + return false, false } - n++ + require = true } } - return n > 0 + return true, require } // CreateHeader adds a file to the zip file using the provided FileHeader @@ -249,8 +251,29 @@ func (w *Writer) CreateHeader(fh *FileHeader) (io.Writer, error) { fh.Flags |= 0x8 // we will write a data descriptor - if hasValidUTF8(fh.Name) || hasValidUTF8(fh.Comment) { - fh.Flags |= 0x800 // filename or comment have valid utf-8 string + // The ZIP format has a sad state of affairs regarding character encoding. + // Officially, the name and comment fields are supposed to be encoded + // in CP-437 (which is mostly compatible with ASCII), unless the UTF-8 + // flag bit is set. However, there are several problems: + // + // * Many ZIP readers still do not support UTF-8. + // * If the UTF-8 flag is cleared, several readers simply interpret the + // name and comment fields as whatever the local system encoding is. + // + // In order to avoid breaking readers without UTF-8 support, + // we avoid setting the UTF-8 flag if the strings are CP-437 compatible. + // However, if the strings require multibyte UTF-8 encoding and is a + // valid UTF-8 string, then we set the UTF-8 bit. + // + // For the case, where the user explicitly wants to specify the encoding + // as UTF-8, they will need to set the flag bit themselves. + // TODO: For the case, where the user explicitly wants to specify that the + // encoding is *not* UTF-8, that is currently not possible. + // See golang.org/issue/10741. + utf8Valid1, utf8Require1 := detectUTF8(fh.Name) + utf8Valid2, utf8Require2 := detectUTF8(fh.Comment) + if (utf8Require1 || utf8Require2) && utf8Valid1 && utf8Valid2 { + fh.Flags |= 0x800 } fh.CreatorVersion = fh.CreatorVersion&0xff00 | zipVersion20 // preserve compatibility byte diff --git a/src/archive/zip/writer_test.go b/src/archive/zip/writer_test.go index 8db159f232..3072f60027 100644 --- a/src/archive/zip/writer_test.go +++ b/src/archive/zip/writer_test.go @@ -156,6 +156,12 @@ func TestWriterUTF8(t *testing.T) { comment: "in the 世界", expect: 0x808, }, + { + // Name is Japanese encoded in Shift JIS. + name: "\x93\xfa\x96{\x8c\xea.txt", + comment: "in the 世界", + expect: 0x008, // UTF-8 must not be set + }, } // write a zip file