]> Cypherpunks repositories - gostls13.git/commitdiff
archive/zip: add FileHeader.NonUTF8 field
authorJoe Tsai <joetsai@digital-static.net>
Thu, 2 Nov 2017 20:53:16 +0000 (13:53 -0700)
committerJoe Tsai <thebrokentoaster@gmail.com>
Mon, 6 Nov 2017 21:35:59 +0000 (21:35 +0000)
The NonUTF8 field provides users with a way to explictly tell the
ZIP writer to avoid setting the UTF-8 flag.
This is necessary because many readers:
1) (Still) do not support UTF-8
2) And use the local system encoding instead

Thus, even though character encodings other than CP-437 and UTF-8
are not officially supported by the ZIP specification, pragmatically
the world has permitted use of them.

When a non-standard encoding is used, it is the user's responsibility
to ensure that the target system is expecting the encoding used
(e.g., producing a ZIP file you know is used on a Chinese version of Windows).

We adjust the detectUTF8 function to account for Shift-JIS and EUC-KR
not being identical to ASCII for two characters.

We don't need an API for users to explicitly specify that they are encoding
with UTF-8 since all single byte characters are compatible with all other
common encodings (Windows-1256, Windows-1252, Windows-1251, Windows-1250,
IEC-8859, EUC-KR, KOI8-R, Latin-1, Shift-JIS, GB-2312, GBK) except for
the non-printable characters and the backslash character (all of which
are invalid characters in a path name anyways).

Fixes #10741

Change-Id: I9004542d1d522c9137973f1b6e2b623fa54dfd66
Reviewed-on: https://go-review.googlesource.com/75592
Run-TryBot: Joe Tsai <thebrokentoaster@gmail.com>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
src/archive/zip/reader.go
src/archive/zip/reader_test.go
src/archive/zip/struct.go
src/archive/zip/testdata/utf8-7zip.zip [new file with mode: 0644]
src/archive/zip/testdata/utf8-infozip.zip [new file with mode: 0644]
src/archive/zip/testdata/utf8-osx.zip [new file with mode: 0644]
src/archive/zip/testdata/utf8-winrar.zip [new file with mode: 0644]
src/archive/zip/testdata/utf8-winzip.zip [new file with mode: 0644]
src/archive/zip/writer.go
src/archive/zip/writer_test.go

index ae01786386215c40887685a2240c66337c410ab7..7417b8f36afc98751a5ccd131f40f9ecf3e4949c 100644 (file)
@@ -281,6 +281,24 @@ func readDirectoryHeader(f *File, r io.Reader) error {
        f.Extra = d[filenameLen : filenameLen+extraLen]
        f.Comment = string(d[filenameLen+extraLen:])
 
+       // Determine the character encoding.
+       utf8Valid1, utf8Require1 := detectUTF8(f.Name)
+       utf8Valid2, utf8Require2 := detectUTF8(f.Comment)
+       switch {
+       case !utf8Valid1 || !utf8Valid2:
+               // Name and Comment definitely not UTF-8.
+               f.NonUTF8 = true
+       case !utf8Require1 && !utf8Require2:
+               // Name and Comment use only single-byte runes that overlap with UTF-8.
+               f.NonUTF8 = false
+       default:
+               // Might be UTF-8, might be some other encoding; preserve existing flag.
+               // Some ZIP writers use UTF-8 encoding without setting the UTF-8 flag.
+               // Since it is impossible to always distinguish valid UTF-8 from some
+               // other encoding (e.g., GBK or Shift-JIS), we trust the flag.
+               f.NonUTF8 = f.Flags&0x800 == 0
+       }
+
        needUSize := f.UncompressedSize == ^uint32(0)
        needCSize := f.CompressedSize == ^uint32(0)
        needHeaderOffset := f.headerOffset == int64(^uint32(0))
index d2d051b223b8048bcdefb05dcacd5de17d9d4818..5fa2c80afa92408cfe8543c147d8548e6b4a5317 100644 (file)
@@ -29,7 +29,8 @@ type ZipTest struct {
 type ZipTestFile struct {
        Name    string
        Mode    os.FileMode
-       ModTime time.Time // optional, modified time in format "mm-dd-yy hh:mm:ss"
+       NonUTF8 bool
+       ModTime time.Time
 
        // Information describing expected zip file content.
        // First, reading the entire content should produce the error ContentErr.
@@ -319,6 +320,68 @@ var tests = []ZipTest{
                        },
                },
        },
+       {
+               Name: "utf8-7zip.zip",
+               File: []ZipTestFile{
+                       {
+                               Name:    "世界",
+                               Content: []byte{},
+                               Mode:    0666,
+                               ModTime: time.Date(2017, 11, 6, 13, 9, 27, 867862500, timeZone(-8*time.Hour)),
+                       },
+               },
+       },
+       {
+               Name: "utf8-infozip.zip",
+               File: []ZipTestFile{
+                       {
+                               Name:    "世界",
+                               Content: []byte{},
+                               Mode:    0644,
+                               // Name is valid UTF-8, but format does not have UTF-8 flag set.
+                               // We don't do UTF-8 detection for multi-byte runes due to
+                               // false-positives with other encodings (e.g., Shift-JIS).
+                               // Format says encoding is not UTF-8, so we trust it.
+                               NonUTF8: true,
+                               ModTime: time.Date(2017, 11, 6, 13, 9, 27, 0, timeZone(-8*time.Hour)),
+                       },
+               },
+       },
+       {
+               Name: "utf8-osx.zip",
+               File: []ZipTestFile{
+                       {
+                               Name:    "世界",
+                               Content: []byte{},
+                               Mode:    0644,
+                               // Name is valid UTF-8, but format does not have UTF-8 set.
+                               NonUTF8: true,
+                               ModTime: time.Date(2017, 11, 6, 13, 9, 27, 0, timeZone(-8*time.Hour)),
+                       },
+               },
+       },
+       {
+               Name: "utf8-winrar.zip",
+               File: []ZipTestFile{
+                       {
+                               Name:    "世界",
+                               Content: []byte{},
+                               Mode:    0666,
+                               ModTime: time.Date(2017, 11, 6, 13, 9, 27, 867862500, timeZone(-8*time.Hour)),
+                       },
+               },
+       },
+       {
+               Name: "utf8-winzip.zip",
+               File: []ZipTestFile{
+                       {
+                               Name:    "世界",
+                               Content: []byte{},
+                               Mode:    0666,
+                               ModTime: time.Date(2017, 11, 6, 13, 9, 27, 867000000, timeZone(-8*time.Hour)),
+                       },
+               },
+       },
        {
                Name: "time-7zip.zip",
                File: []ZipTestFile{
index 668d018fdfeba246e0b49b4ed3bec73bb8bbcdf3..f2bc7be6a5ac048e687d1ece694bf6ec2d0ad728 100644 (file)
@@ -81,11 +81,24 @@ const (
 // See the zip spec for details.
 type FileHeader struct {
        // Name is the name of the file.
-       // It must be a relative path: it must not start with a drive
-       // letter (e.g. C:) or leading slash, and only forward slashes
-       // are allowed.
+       // It must be a relative path, not start with a drive letter (e.g. C:),
+       // and must use forward slashes instead of back slashes.
        Name string
 
+       // Comment is any arbitrary user-defined string shorter than 64KiB.
+       Comment string
+
+       // NonUTF8 indicates that Name and Comment are not encoded in UTF-8.
+       //
+       // By specification, the only other encoding permitted should be CP-437,
+       // but historically many ZIP readers interpret Name and Comment as whatever
+       // the system's local character encoding happens to be.
+       //
+       // This flag should only be set if the user intends to encode a non-portable
+       // ZIP file for a specific localized region. Otherwise, the Writer
+       // automatically sets the ZIP format's UTF-8 flag for valid UTF-8 strings.
+       NonUTF8 bool
+
        CreatorVersion uint16
        ReaderVersion  uint16
        Flags          uint16
@@ -111,7 +124,6 @@ type FileHeader struct {
        UncompressedSize64 uint64
        Extra              []byte
        ExternalAttrs      uint32 // Meaning depends on CreatorVersion
-       Comment            string
 }
 
 // FileInfo returns an os.FileInfo for the FileHeader.
diff --git a/src/archive/zip/testdata/utf8-7zip.zip b/src/archive/zip/testdata/utf8-7zip.zip
new file mode 100644 (file)
index 0000000..0e97884
Binary files /dev/null and b/src/archive/zip/testdata/utf8-7zip.zip differ
diff --git a/src/archive/zip/testdata/utf8-infozip.zip b/src/archive/zip/testdata/utf8-infozip.zip
new file mode 100644 (file)
index 0000000..25a8926
Binary files /dev/null and b/src/archive/zip/testdata/utf8-infozip.zip differ
diff --git a/src/archive/zip/testdata/utf8-osx.zip b/src/archive/zip/testdata/utf8-osx.zip
new file mode 100644 (file)
index 0000000..9b0c058
Binary files /dev/null and b/src/archive/zip/testdata/utf8-osx.zip differ
diff --git a/src/archive/zip/testdata/utf8-winrar.zip b/src/archive/zip/testdata/utf8-winrar.zip
new file mode 100644 (file)
index 0000000..4bad6c3
Binary files /dev/null and b/src/archive/zip/testdata/utf8-winrar.zip differ
diff --git a/src/archive/zip/testdata/utf8-winzip.zip b/src/archive/zip/testdata/utf8-winzip.zip
new file mode 100644 (file)
index 0000000..909d52e
Binary files /dev/null and b/src/archive/zip/testdata/utf8-winzip.zip differ
index 9fb9cee1ae82821e2213039ee236091634344234..7b33968618821435b3d94b70067aacb36da8446f 100644 (file)
@@ -216,12 +216,17 @@ func (w *Writer) Create(name string) (io.Writer, error) {
 }
 
 // detectUTF8 reports whether s is a valid UTF-8 string, and whether the string
-// must be considered UTF-8 encoding (i.e., not compatible with CP-437).
+// must be considered UTF-8 encoding (i.e., not compatible with CP-437, ASCII,
+// or any other common encoding).
 func detectUTF8(s string) (valid, require bool) {
        for _, r := range s {
-               // By default, ZIP uses CP-437,
-               // which is only identical to ASCII for the printable characters.
-               if r < 0x20 || r >= 0x7f {
+               // Officially, ZIP uses CP-437, but many readers use the system's
+               // local character encoding. Most encoding are compatible with a large
+               // subset of CP-437, which itself is ASCII-like.
+               //
+               // Forbid 0x7e and 0x5c since EUC-KR and Shift-JIS replace those
+               // characters with localized currency and overline characters.
+               if r < 0x20 || r > 0x7d || r == 0x5c {
                        if !utf8.ValidRune(r) || r == utf8.RuneError {
                                return false, false
                        }
@@ -267,12 +272,12 @@ func (w *Writer) CreateHeader(fh *FileHeader) (io.Writer, error) {
        //
        // For the case, where the user explicitly wants to specify the encoding
        // as UTF-8, they will need to set the flag bit themselves.
-       // TODO: For the case, where the user explicitly wants to specify that the
-       // encoding is *not* UTF-8, that is currently not possible.
-       // See golang.org/issue/10741.
        utf8Valid1, utf8Require1 := detectUTF8(fh.Name)
        utf8Valid2, utf8Require2 := detectUTF8(fh.Comment)
-       if (utf8Require1 || utf8Require2) && utf8Valid1 && utf8Valid2 {
+       switch {
+       case fh.NonUTF8:
+               fh.Flags &^= 0x800
+       case (utf8Require1 || utf8Require2) && (utf8Valid1 && utf8Valid2):
                fh.Flags |= 0x800
        }
 
index acca97e9b6985dc063e57b721fc1dbe7caf6a488..ee5c8663100940d88bbdb5425b2777c8be10350d 100644 (file)
@@ -137,6 +137,7 @@ func TestWriterUTF8(t *testing.T) {
                name    string
                comment string
                expect  uint16
+               nonUTF8 bool
        }{
                {
                        name:    "hi, hello",
@@ -148,6 +149,12 @@ func TestWriterUTF8(t *testing.T) {
                        comment: "in the world",
                        expect:  0x808,
                },
+               {
+                       name:    "hi, こんにちわ",
+                       comment: "in the world",
+                       nonUTF8: true,
+                       expect:  0x8,
+               },
                {
                        name:    "hi, hello",
                        comment: "in the 世界",
@@ -174,6 +181,7 @@ func TestWriterUTF8(t *testing.T) {
                h := &FileHeader{
                        Name:    test.name,
                        Comment: test.comment,
+                       NonUTF8: test.nonUTF8,
                        Method:  Deflate,
                }
                w, err := w.CreateHeader(h)