]> Cypherpunks repositories - gostls13.git/commitdiff
archive/tar: add support for long binary strings in GNU format
authorJoe Tsai <joetsai@digital-static.net>
Mon, 14 Aug 2017 23:44:15 +0000 (16:44 -0700)
committerJoe Tsai <thebrokentoaster@gmail.com>
Wed, 16 Aug 2017 00:39:32 +0000 (00:39 +0000)
The GNU tar format defines the following type flags:
TypeGNULongName = 'L' // Next file has a long name
TypeGNULongLink = 'K' // Next file symlinks to a file w/ a long name

Anytime a string exceeds the field dedicated to store it, the GNU format
permits a fake "file" to be prepended where that file entry has a Typeflag
of 'L' or 'K' and the contents of the file is a NUL-terminated string.

Contrary to previous TODO comments,
the GNU format supports arbitrary strings (without NUL) rather UTF-8 strings.
The manual says the following:
<<<
The name, linkname, magic, uname, and gname are
null-terminated character strings
>>>
<<<
All characters in header blocks are represented
by using 8-bit characters in the local variant of ASCII.
>>>

From this description, we gather the following:
* We must forbid NULs in any GNU strings
* Any 8-bit value (other than NUL) is permitted

Since the modern world has moved to UTF-8, it is really difficult to
determine what a "local variant of ASCII" means. For this reason,
we treat strings as just an arbitrary binary string (without NUL)
and leave it to the user to determine the encoding of this string.
(Practically, it seems that UTF-8 is the typical encoding used
in GNU archives seen in the wild).

The implementation of GNU tar seems to confirm this interpretation
of the manual where it permits any arbitrary binary string to exist
within these fields so long as they do not contain the NUL character.

 $ touch `echo -e "not\x80\x81\x82\x83utf8"`
 $ gnutar -H gnu --tar -cvf gnu-not-utf8.tar $(echo -e "not\x80\x81\x82\x83utf8")

The fact that we permit arbitrary binary in GNU strings goes
hand-in-hand with the fact that GNU also permits a "base-256" encoding
of numeric fields, which is effectively two-complement binary.

Change-Id: Ic037ec6bed306d07d1312f0058594bd9b64d9880
Reviewed-on: https://go-review.googlesource.com/55573
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Run-TryBot: Joe Tsai <thebrokentoaster@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>

src/archive/tar/common.go
src/archive/tar/reader_test.go
src/archive/tar/strconv.go
src/archive/tar/tar_test.go
src/archive/tar/testdata/gnu-not-utf8.tar [new file with mode: 0644]
src/archive/tar/testdata/gnu-utf8.tar [new file with mode: 0644]
src/archive/tar/writer.go
src/archive/tar/writer_test.go

index b1704a402d431993a65aec6c0a4ec03331dff1a7..795bc0a9169d982bd66258b882916345af082630 100644 (file)
@@ -85,14 +85,13 @@ func (h *Header) allowedFormats() (format int, paxHdrs map[string]string) {
        format = formatUSTAR | formatPAX | formatGNU
        paxHdrs = make(map[string]string)
 
-       verifyString := func(s string, size int, gnuLong bool, paxKey string) {
-
+       verifyString := func(s string, size int, paxKey string) {
                // NUL-terminator is optional for path and linkpath.
                // Technically, it is required for uname and gname,
                // but neither GNU nor BSD tar checks for it.
                tooLong := len(s) > size
-               if !isASCII(s) || (tooLong && !gnuLong) {
-                       // TODO(dsnet): GNU supports UTF-8 (without NUL) for strings.
+               allowLongGNU := paxKey == paxPath || paxKey == paxLinkpath
+               if hasNUL(s) || (tooLong && !allowLongGNU) {
                        format &^= formatGNU // No GNU
                }
                if !isASCII(s) || tooLong {
@@ -120,15 +119,16 @@ func (h *Header) allowedFormats() (format int, paxHdrs map[string]string) {
                        }
                }
        }
-       verifyTime := func(ts time.Time, size int, ustarField bool, paxKey string) {
+       verifyTime := func(ts time.Time, size int, paxKey string) {
                if ts.IsZero() {
                        return // Always okay
                }
                needsNano := ts.Nanosecond() != 0
+               hasFieldUSTAR := paxKey == paxMtime
                if !fitsInBase256(size, ts.Unix()) || needsNano {
                        format &^= formatGNU // No GNU
                }
-               if !fitsInOctal(size, ts.Unix()) || needsNano || !ustarField {
+               if !fitsInOctal(size, ts.Unix()) || needsNano || !hasFieldUSTAR {
                        format &^= formatUSTAR // No USTAR
                        if paxKey == paxNone {
                                format &^= formatPAX // No PAX
@@ -138,26 +138,23 @@ func (h *Header) allowedFormats() (format int, paxHdrs map[string]string) {
                }
        }
 
-       // TODO(dsnet): Add GNU long name support.
-       const supportGNULong = false
-
        var blk block
        v7 := blk.V7()
        ustar := blk.USTAR()
        gnu := blk.GNU()
-       verifyString(h.Name, len(v7.Name()), supportGNULong, paxPath)
-       verifyString(h.Linkname, len(v7.LinkName()), supportGNULong, paxLinkpath)
-       verifyString(h.Uname, len(ustar.UserName()), false, paxUname)
-       verifyString(h.Gname, len(ustar.GroupName()), false, paxGname)
+       verifyString(h.Name, len(v7.Name()), paxPath)
+       verifyString(h.Linkname, len(v7.LinkName()), paxLinkpath)
+       verifyString(h.Uname, len(ustar.UserName()), paxUname)
+       verifyString(h.Gname, len(ustar.GroupName()), paxGname)
        verifyNumeric(h.Mode, len(v7.Mode()), paxNone)
        verifyNumeric(int64(h.Uid), len(v7.UID()), paxUid)
        verifyNumeric(int64(h.Gid), len(v7.GID()), paxGid)
        verifyNumeric(h.Size, len(v7.Size()), paxSize)
        verifyNumeric(h.Devmajor, len(ustar.DevMajor()), paxNone)
        verifyNumeric(h.Devminor, len(ustar.DevMinor()), paxNone)
-       verifyTime(h.ModTime, len(v7.ModTime()), true, paxMtime)
-       verifyTime(h.AccessTime, len(gnu.AccessTime()), false, paxAtime)
-       verifyTime(h.ChangeTime, len(gnu.ChangeTime()), false, paxCtime)
+       verifyTime(h.ModTime, len(v7.ModTime()), paxMtime)
+       verifyTime(h.AccessTime, len(gnu.AccessTime()), paxAtime)
+       verifyTime(h.ChangeTime, len(gnu.ChangeTime()), paxCtime)
 
        if !isHeaderOnlyType(h.Typeflag) && h.Size < 0 {
                return formatUnknown, nil
index 79d271717c1b01487c9334a1a1cac22fd0b6f506..42d4ab7e143b9791cf84ecb3b7161fa779d5086a 100644 (file)
@@ -350,6 +350,41 @@ func TestReader(t *testing.T) {
                        Uname:    "rawr",
                        Gname:    "dsnet",
                }},
+       }, {
+               // This archive was generated by Writer but is readable by both
+               // GNU and BSD tar utilities.
+               // The archive generated by GNU is nearly byte-for-byte identical
+               // to the Go version except the Go version sets a negative Devminor
+               // just to force the GNU format.
+               file: "testdata/gnu-utf8.tar",
+               headers: []*Header{{
+                       Name: "☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹",
+                       Mode: 0644,
+                       Uid:  1000, Gid: 1000,
+                       ModTime:  time.Unix(0, 0),
+                       Typeflag: '0',
+                       Uname:    "☺",
+                       Gname:    "⚹",
+                       Devminor: -1,
+               }},
+       }, {
+               // This archive was generated by Writer but is readable by both
+               // GNU and BSD tar utilities.
+               // The archive generated by GNU is nearly byte-for-byte identical
+               // to the Go version except the Go version sets a negative Devminor
+               // just to force the GNU format.
+               file: "testdata/gnu-not-utf8.tar",
+               headers: []*Header{{
+                       Name:     "hi\x80\x81\x82\x83bye",
+                       Mode:     0644,
+                       Uid:      1000,
+                       Gid:      1000,
+                       ModTime:  time.Unix(0, 0),
+                       Typeflag: '0',
+                       Uname:    "rawr",
+                       Gname:    "dsnet",
+                       Devminor: -1,
+               }},
        }, {
                // BSD tar v3.1.2 and GNU tar v1.27.1 both rejects PAX records
                // with NULs in the key.
index 89ac8112e5fb291f2758a3dbada8ff4873887680..e02963b74ba7ea1e5e5323ecbaae5a06106b6011 100644 (file)
@@ -12,6 +12,11 @@ import (
        "time"
 )
 
+// hasNUL reports whether the NUL character exists within s.
+func hasNUL(s string) bool {
+       return strings.IndexByte(s, 0) >= 0
+}
+
 // isASCII reports whether the input is an ASCII C-style string.
 func isASCII(s string) bool {
        for _, c := range s {
@@ -59,7 +64,6 @@ func (f *formatter) formatString(b []byte, s string) {
        if len(s) > len(b) {
                f.err = ErrFieldTooLong
        }
-       s = toASCII(s) // TODO(dsnet): Remove this for UTF-8 support in GNU format
        copy(b, s)
        if len(s) < len(b) {
                b[len(s)] = 0
@@ -307,8 +311,8 @@ func validPAXRecord(k, v string) bool {
        }
        switch k {
        case paxPath, paxLinkpath, paxUname, paxGname:
-               return strings.IndexByte(v, 0) < 0
+               return !hasNUL(v)
        default:
-               return strings.IndexByte(k, 0) < 0
+               return !hasNUL(k)
        }
 }
index 22e23a6bbdc5fb307e9427bbb883ac3d902f72e1..1a38ecb446b29ff90c0f29b12ff83939d30062fc 100644 (file)
@@ -386,7 +386,7 @@ func TestHeaderAllowedFormats(t *testing.T) {
                formats: formatUnknown,
        }, {
                header:  &Header{Name: "用戶名", Devmajor: -1 << 56},
-               formats: formatUnknown,
+               formats: formatGNU,
        }, {
                header:  &Header{Size: math.MaxInt64},
                paxHdrs: map[string]string{paxSize: "9223372036854775807"},
@@ -408,10 +408,14 @@ func TestHeaderAllowedFormats(t *testing.T) {
        }, {
                header:  &Header{Name: strings.Repeat("a", nameSize)},
                formats: formatUSTAR | formatPAX | formatGNU,
+       }, {
+               header:  &Header{Name: strings.Repeat("a", nameSize+1)},
+               paxHdrs: map[string]string{paxPath: strings.Repeat("a", nameSize+1)},
+               formats: formatPAX | formatGNU,
        }, {
                header:  &Header{Linkname: "用戶名"},
                paxHdrs: map[string]string{paxLinkpath: "用戶名"},
-               formats: formatPAX,
+               formats: formatPAX | formatGNU,
        }, {
                header:  &Header{Linkname: strings.Repeat("用戶名\x00", nameSize)},
                paxHdrs: map[string]string{paxLinkpath: strings.Repeat("用戶名\x00", nameSize)},
diff --git a/src/archive/tar/testdata/gnu-not-utf8.tar b/src/archive/tar/testdata/gnu-not-utf8.tar
new file mode 100644 (file)
index 0000000..34b4c57
Binary files /dev/null and b/src/archive/tar/testdata/gnu-not-utf8.tar differ
diff --git a/src/archive/tar/testdata/gnu-utf8.tar b/src/archive/tar/testdata/gnu-utf8.tar
new file mode 100644 (file)
index 0000000..dde941c
Binary files /dev/null and b/src/archive/tar/testdata/gnu-utf8.tar differ
index a918ff3eef99485c674334549a570ed1f55ab6b6..ffef29af1089ca63254eb3715ea363e2d9961970 100644 (file)
@@ -121,12 +121,10 @@ func (tw *Writer) writePAXHeader(hdr *Header, paxHdrs map[string]string) error {
        }
 
        // Pack the main header.
-       var f formatter
-       blk := tw.templateV7Plus(hdr, f.formatString, f.formatOctal)
+       var f formatter // Ignore errors since they are expected
+       fmtStr := func(b []byte, s string) { f.formatString(b, toASCII(s)) }
+       blk := tw.templateV7Plus(hdr, fmtStr, f.formatOctal)
        blk.SetFormat(formatPAX)
-       if f.err != nil && len(paxHdrs) == 0 {
-               return f.err // Should never happen, otherwise PAX headers would be used
-       }
        return tw.writeRawHeader(blk, hdr.Size, hdr.Typeflag)
 }
 
@@ -134,10 +132,23 @@ func (tw *Writer) writeGNUHeader(hdr *Header) error {
        // TODO(dsnet): Support writing sparse files.
        // See https://golang.org/issue/13548
 
-       // TODO(dsnet): Support long filenames (with UTF-8) support.
+       // Use long-link files if Name or Linkname exceeds the field size.
+       const longName = "././@LongLink"
+       if len(hdr.Name) > nameSize {
+               data := hdr.Name + "\x00"
+               if err := tw.writeRawFile(longName, data, TypeGNULongName, formatGNU); err != nil {
+                       return err
+               }
+       }
+       if len(hdr.Linkname) > nameSize {
+               data := hdr.Linkname + "\x00"
+               if err := tw.writeRawFile(longName, data, TypeGNULongLink, formatGNU); err != nil {
+                       return err
+               }
+       }
 
        // Pack the main header.
-       var f formatter
+       var f formatter // Ignore errors since they are expected
        blk := tw.templateV7Plus(hdr, f.formatString, f.formatNumeric)
        if !hdr.AccessTime.IsZero() {
                f.formatNumeric(blk.GNU().AccessTime(), hdr.AccessTime.Unix())
@@ -146,9 +157,6 @@ func (tw *Writer) writeGNUHeader(hdr *Header) error {
                f.formatNumeric(blk.GNU().ChangeTime(), hdr.ChangeTime.Unix())
        }
        blk.SetFormat(formatGNU)
-       if f.err != nil {
-               return f.err // Should never happen since header is validated
-       }
        return tw.writeRawHeader(blk, hdr.Size, hdr.Typeflag)
 }
 
index a6eec5a9da9d97d16585de5e1d205d4b8a615b8e..9d92ab89a6e13f80f38d462858a36de38b329730 100644 (file)
@@ -219,6 +219,35 @@ func TestWriter(t *testing.T) {
                        },
                }},
                err: ErrHeader,
+       }, {
+               file: "testdata/gnu-utf8.tar",
+               entries: []*entry{{
+                       header: &Header{
+                               Name: "☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹",
+                               Mode: 0644,
+                               Uid:  1000, Gid: 1000,
+                               ModTime:  time.Unix(0, 0),
+                               Typeflag: '0',
+                               Uname:    "☺",
+                               Gname:    "⚹",
+                               Devminor: -1, // Force use of GNU format
+                       },
+               }},
+       }, {
+               file: "testdata/gnu-not-utf8.tar",
+               entries: []*entry{{
+                       header: &Header{
+                               Name:     "hi\x80\x81\x82\x83bye",
+                               Mode:     0644,
+                               Uid:      1000,
+                               Gid:      1000,
+                               ModTime:  time.Unix(0, 0),
+                               Typeflag: '0',
+                               Uname:    "rawr",
+                               Gname:    "dsnet",
+                               Devminor: -1, // Force use of GNU format
+                       },
+               }},
        }}
 
        for _, v := range vectors {