]> Cypherpunks repositories - gostls13.git/commitdiff
compress/flate: make library RFC1951 compliant
authorJoe Tsai <joetsai@digital-static.net>
Thu, 11 Jun 2015 23:33:52 +0000 (16:33 -0700)
committerNigel Tao <nigeltao@golang.org>
Wed, 17 Jun 2015 03:21:49 +0000 (03:21 +0000)
Corrected several issues:
 * RFC1951 section 3.2.7 dictates that it is okay for the HDist tree to have a
single code of zero bits. Furthermore, the behavior of the C zlib library
permits empty trees even when there are more than one codes.
 * RFC1951 section 3.2.5 shows that HLit codes 286 and 287 are invalid. Thus,
Go's implementation should choke on inputs using these codes.
 * RFC1951 section 3.2.5 and 3.2.7 are ambiguous about whether the number of
HDist codes can be greater than 30. The C zlib library (which is the canonical
reference implementation) performs this check here:
https://github.com/madler/zlib/blob/62d6112a7981ad7c34f3b43cffdf00d4662a4f25/inflate.c#L906

In addition, a number of test cases were added to the unit tests that exercises
these edge cases. The test cases listed in TestStreams will either fail or
succeed in a manner matching the behaviour of the C zlib version. Given that the
C zlib implementation is the reference for the world, Go's implementation should
match C zlib behaviour.

Fixes #11030

Change-Id: Ic24e4e40ce5832c7e1930249246e86d34bfedaa6
Reviewed-on: https://go-review.googlesource.com/11000
Reviewed-by: Nigel Tao <nigeltao@golang.org>
src/compress/flate/flate_test.go
src/compress/flate/gen.go
src/compress/flate/huffman_bit_writer.go
src/compress/flate/huffman_code.go
src/compress/flate/inflate.go

index 06d35a066a2ac30984021b088f860e5efb543f02..3f67025cd76b5f98ba26bc615dd87a80d28bd583 100644 (file)
@@ -10,23 +10,11 @@ package flate
 
 import (
        "bytes"
+       "encoding/hex"
        "io/ioutil"
-       "strings"
        "testing"
 )
 
-func TestUncompressedSource(t *testing.T) {
-       decoder := NewReader(bytes.NewReader([]byte{0x01, 0x01, 0x00, 0xfe, 0xff, 0x11}))
-       output := make([]byte, 1)
-       n, error := decoder.Read(output)
-       if n != 1 || error != nil {
-               t.Fatalf("decoder.Read() = %d, %v, want 1, nil", n, error)
-       }
-       if output[0] != 0x11 {
-               t.Errorf("output[0] = %x, want 0x11", output[0])
-       }
-}
-
 // The following test should not panic.
 func TestIssue5915(t *testing.T) {
        bits := []int{4, 0, 0, 6, 4, 3, 2, 3, 3, 4, 4, 5, 0, 0, 0, 0, 5, 5, 6,
@@ -90,29 +78,183 @@ func TestInvalidBits(t *testing.T) {
        }
 }
 
-func TestDegenerateHuffmanCoding(t *testing.T) {
-       const (
-               want = "abcabc"
-               // This compressed form has a dynamic Huffman block, even though a
-               // sensible encoder would use a literal data block, as the latter is
-               // shorter. Still, it is a valid flate compression of "abcabc". It has
-               // a degenerate Huffman table with only one coded value: the one
-               // non-literal back-ref copy of the first "abc" to the second "abc".
-               //
-               // To verify that this is decompressible with zlib (the C library),
-               // it's easy to use the Python wrapper library:
-               // >>> import zlib
-               // >>> compressed = "\x0c\xc2...etc...\xff\xff"
-               // >>> zlib.decompress(compressed, -15) # negative means no GZIP header.
-               // 'abcabc'
-               compressed = "\x0c\xc2\x01\x0d\x00\x00\x00\x82\xb0\xac\x4a\xff\x0e\xb0\x7d\x27" +
-                       "\x06\x00\x00\xff\xff"
-       )
-       b, err := ioutil.ReadAll(NewReader(strings.NewReader(compressed)))
-       if err != nil {
-               t.Fatal(err)
-       }
-       if got := string(b); got != want {
-               t.Fatalf("got %q, want %q", got, want)
+func TestStreams(t *testing.T) {
+       // To verify any of these hexstrings as valid or invalid flate streams
+       // according to the C zlib library, you can use the Python wrapper library:
+       // >>> hex_string = "010100feff11"
+       // >>> import zlib
+       // >>> zlib.decompress(hex_string.decode("hex"), -15) # Negative means raw DEFLATE
+       // '\x11'
+
+       testCases := []struct {
+               desc   string // Description of the stream
+               stream string // Hexstring of the input DEFLATE stream
+               want   string // Expected result. Use "fail" to expect failure
+       }{{
+               "degenerate HCLenTree",
+               "05e0010000000000100000000000000000000000000000000000000000000000" +
+                       "00000000000000000004",
+               "fail",
+       }, {
+               "complete HCLenTree, empty HLitTree, empty HDistTree",
+               "05e0010400000000000000000000000000000000000000000000000000000000" +
+                       "00000000000000000010",
+               "fail",
+       }, {
+               "empty HCLenTree",
+               "05e0010000000000000000000000000000000000000000000000000000000000" +
+                       "00000000000000000010",
+               "fail",
+       }, {
+               "complete HCLenTree, complete HLitTree, empty HDistTree, use missing HDist symbol",
+               "000100feff000de0010400000000100000000000000000000000000000000000" +
+                       "0000000000000000000000000000002c",
+               "fail",
+       }, {
+               "complete HCLenTree, complete HLitTree, degenerate HDistTree, use missing HDist symbol",
+               "000100feff000de0010000000000000000000000000000000000000000000000" +
+                       "00000000000000000610000000004070",
+               "fail",
+       }, {
+               "complete HCLenTree, empty HLitTree, empty HDistTree",
+               "05e0010400000000100400000000000000000000000000000000000000000000" +
+                       "0000000000000000000000000008",
+               "fail",
+       }, {
+               "complete HCLenTree, empty HLitTree, degenerate HDistTree",
+               "05e0010400000000100400000000000000000000000000000000000000000000" +
+                       "0000000000000000000800000008",
+               "fail",
+       }, {
+               "complete HCLenTree, degenerate HLitTree, degenerate HDistTree, use missing HLit symbol",
+               "05e0010400000000100000000000000000000000000000000000000000000000" +
+                       "0000000000000000001c",
+               "fail",
+       }, {
+               "complete HCLenTree, complete HLitTree, too large HDistTree",
+               "edff870500000000200400000000000000000000000000000000000000000000" +
+                       "000000000000000000080000000000000004",
+               "fail",
+       }, {
+               "complete HCLenTree, complete HLitTree, empty HDistTree, excessive repeater code",
+               "edfd870500000000200400000000000000000000000000000000000000000000" +
+                       "000000000000000000e8b100",
+               "fail",
+       }, {
+               "complete HCLenTree, complete HLitTree, empty HDistTree of normal length 30",
+               "05fd01240000000000f8ffffffffffffffffffffffffffffffffffffffffffff" +
+                       "ffffffffffffffffff07000000fe01",
+               "",
+       }, {
+               "complete HCLenTree, complete HLitTree, empty HDistTree of excessive length 31",
+               "05fe01240000000000f8ffffffffffffffffffffffffffffffffffffffffffff" +
+                       "ffffffffffffffffff07000000fc03",
+               "fail",
+       }, {
+               "complete HCLenTree, over-subscribed HLitTree, empty HDistTree",
+               "05e001240000000000fcffffffffffffffffffffffffffffffffffffffffffff" +
+                       "ffffffffffffffffff07f00f",
+               "fail",
+       }, {
+               "complete HCLenTree, under-subscribed HLitTree, empty HDistTree",
+               "05e001240000000000fcffffffffffffffffffffffffffffffffffffffffffff" +
+                       "fffffffffcffffffff07f00f",
+               "fail",
+       }, {
+               "complete HCLenTree, complete HLitTree with single code, empty HDistTree",
+               "05e001240000000000f8ffffffffffffffffffffffffffffffffffffffffffff" +
+                       "ffffffffffffffffff07f00f",
+               "01",
+       }, {
+               "complete HCLenTree, complete HLitTree with multiple codes, empty HDistTree",
+               "05e301240000000000f8ffffffffffffffffffffffffffffffffffffffffffff" +
+                       "ffffffffffffffffff07807f",
+               "01",
+       }, {
+               "complete HCLenTree, complete HLitTree, degenerate HDistTree, use valid HDist symbol",
+               "000100feff000de0010400000000100000000000000000000000000000000000" +
+                       "0000000000000000000000000000003c",
+               "00000000",
+       }, {
+               "complete HCLenTree, degenerate HLitTree, degenerate HDistTree",
+               "05e0010400000000100000000000000000000000000000000000000000000000" +
+                       "0000000000000000000c",
+               "",
+       }, {
+               "complete HCLenTree, degenerate HLitTree, empty HDistTree",
+               "05e0010400000000100000000000000000000000000000000000000000000000" +
+                       "00000000000000000004",
+               "",
+       }, {
+               "complete HCLenTree, complete HLitTree, empty HDistTree, spanning repeater code",
+               "edfd870500000000200400000000000000000000000000000000000000000000" +
+                       "000000000000000000e8b000",
+               "",
+       }, {
+               "complete HCLenTree with length codes, complete HLitTree, empty HDistTree",
+               "ede0010400000000100000000000000000000000000000000000000000000000" +
+                       "0000000000000000000400004000",
+               "",
+       }, {
+               "complete HCLenTree, complete HLitTree, degenerate HDistTree, use valid HLit symbol 284 with count 31",
+               "000100feff00ede0010400000000100000000000000000000000000000000000" +
+                       "000000000000000000000000000000040000407f00",
+               "0000000000000000000000000000000000000000000000000000000000000000" +
+                       "0000000000000000000000000000000000000000000000000000000000000000" +
+                       "0000000000000000000000000000000000000000000000000000000000000000" +
+                       "0000000000000000000000000000000000000000000000000000000000000000" +
+                       "0000000000000000000000000000000000000000000000000000000000000000" +
+                       "0000000000000000000000000000000000000000000000000000000000000000" +
+                       "0000000000000000000000000000000000000000000000000000000000000000" +
+                       "0000000000000000000000000000000000000000000000000000000000000000" +
+                       "000000",
+       }, {
+               "complete HCLenTree, complete HLitTree, degenerate HDistTree, use valid HLit and HDist symbols",
+               "0cc2010d00000082b0ac4aff0eb07d27060000ffff",
+               "616263616263",
+       }, {
+               "fixed block, use reserved symbol 287",
+               "33180700",
+               "fail",
+       }, {
+               "raw block",
+               "010100feff11",
+               "11",
+       }, {
+               "issue 10426 - over-subscribed HCLenTree causes a hang",
+               "344c4a4e494d4b070000ff2e2eff2e2e2e2e2eff",
+               "fail",
+       }, {
+               "issue 11030 - empty HDistTree unexpectedly leads to error",
+               "05c0070600000080400fff37a0ca",
+               "",
+       }, {
+               "issue 11033 - empty HDistTree unexpectedly leads to error",
+               "050fb109c020cca5d017dcbca044881ee1034ec149c8980bbc413c2ab35be9dc" +
+                       "b1473449922449922411202306ee97b0383a521b4ffdcf3217f9f7d3adb701",
+               "3130303634342068652e706870005d05355f7ed957ff084a90925d19e3ebc6d0" +
+                       "c6d7",
+       }}
+
+       for i, tc := range testCases {
+               data, err := hex.DecodeString(tc.stream)
+               if err != nil {
+                       t.Fatal(err)
+               }
+               data, err = ioutil.ReadAll(NewReader(bytes.NewReader(data)))
+               if tc.want == "fail" {
+                       if err == nil {
+                               t.Errorf("#%d (%s): got nil error, want non-nil", i, tc.desc)
+                       }
+               } else {
+                       if err != nil {
+                               t.Errorf("#%d (%s): %v", i, tc.desc, err)
+                               continue
+                       }
+                       if got := hex.EncodeToString(data); got != tc.want {
+                               t.Errorf("#%d (%s):\ngot  %q\nwant %q", i, tc.desc, got, tc.want)
+                       }
+
+               }
        }
 }
index eeafa84a5d0c0b07a5067a6651cbfe0f76d5f025..154c89a488e08c2ed8d321e6034d88124a7e0b8e 100644 (file)
@@ -45,6 +45,10 @@ type huffmanDecoder struct {
 }
 
 // Initialize Huffman decoding tables from array of code lengths.
+// Following this function, h is guaranteed to be initialized into a complete
+// tree (i.e., neither over-subscribed nor under-subscribed). The exception is a
+// degenerate case where the tree has only a single symbol with length 1. Empty
+// trees are permitted.
 func (h *huffmanDecoder) init(bits []int) bool {
        // Sanity enables additional runtime tests during Huffman
        // table construction.  It's intended to be used during
@@ -71,8 +75,16 @@ func (h *huffmanDecoder) init(bits []int) bool {
                }
                count[n]++
        }
+
+       // Empty tree. The decompressor.huffSym function will fail later if the tree
+       // is used. Technically, an empty tree is only valid for the HDIST tree and
+       // not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree
+       // is guaranteed to fail since it will attempt to use the tree to decode the
+       // codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is
+       // guaranteed to fail later since the compressed data section must be
+       // composed of at least one symbol (the end-of-block marker).
        if max == 0 {
-               return false
+               return true
        }
 
        code := 0
index b182a710b9af9ab3767c8684d68c9ae2a7fab48c..616440412e4b2b869d22859bd873ea43a8145286 100644 (file)
@@ -87,11 +87,11 @@ type huffmanBitWriter struct {
 func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
        return &huffmanBitWriter{
                w:               w,
-               literalFreq:     make([]int32, maxLit),
+               literalFreq:     make([]int32, maxNumLit),
                offsetFreq:      make([]int32, offsetCodeCount),
-               codegen:         make([]uint8, maxLit+offsetCodeCount+1),
+               codegen:         make([]uint8, maxNumLit+offsetCodeCount+1),
                codegenFreq:     make([]int32, codegenCodeCount),
-               literalEncoding: newHuffmanEncoder(maxLit),
+               literalEncoding: newHuffmanEncoder(maxNumLit),
                offsetEncoding:  newHuffmanEncoder(offsetCodeCount),
                codegenEncoding: newHuffmanEncoder(codegenCodeCount),
        }
index 3b9fce466ed242667d1fe024342cacac4af3013b..50ec79c94051e1ba610e5926c77633514ffb83ab 100644 (file)
@@ -47,11 +47,11 @@ func newHuffmanEncoder(size int) *huffmanEncoder {
 
 // Generates a HuffmanCode corresponding to the fixed literal table
 func generateFixedLiteralEncoding() *huffmanEncoder {
-       h := newHuffmanEncoder(maxLit)
+       h := newHuffmanEncoder(maxNumLit)
        codeBits := h.codeBits
        code := h.code
        var ch uint16
-       for ch = 0; ch < maxLit; ch++ {
+       for ch = 0; ch < maxNumLit; ch++ {
                var bits uint16
                var size uint8
                switch {
index 6f88159dfac78362a88cbb3179e5f999f0b379d0..04372dec2419c86a7f66b9965cfc0d16fffdaa38 100644 (file)
@@ -18,10 +18,12 @@ import (
 const (
        maxCodeLen = 16    // max length of Huffman code
        maxHist    = 32768 // max history required
-       // The next three numbers come from the RFC, section 3.2.7.
-       maxLit   = 286
-       maxDist  = 32
-       numCodes = 19 // number of codes in Huffman meta-code
+       // The next three numbers come from the RFC section 3.2.7, with the
+       // additional proviso in section 3.2.5 which implies that distance codes
+       // 30 and 31 should never occur in compressed data.
+       maxNumLit  = 286
+       maxNumDist = 30
+       numCodes   = 19 // number of codes in Huffman meta-code
 )
 
 // A CorruptInputError reports the presence of corrupt input at a given offset.
@@ -101,6 +103,10 @@ type huffmanDecoder struct {
 }
 
 // Initialize Huffman decoding tables from array of code lengths.
+// Following this function, h is guaranteed to be initialized into a complete
+// tree (i.e., neither over-subscribed nor under-subscribed). The exception is a
+// degenerate case where the tree has only a single symbol with length 1. Empty
+// trees are permitted.
 func (h *huffmanDecoder) init(bits []int) bool {
        // Sanity enables additional runtime tests during Huffman
        // table construction.  It's intended to be used during
@@ -127,8 +133,16 @@ func (h *huffmanDecoder) init(bits []int) bool {
                }
                count[n]++
        }
+
+       // Empty tree. The decompressor.huffSym function will fail later if the tree
+       // is used. Technically, an empty tree is only valid for the HDIST tree and
+       // not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree
+       // is guaranteed to fail since it will attempt to use the tree to decode the
+       // codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is
+       // guaranteed to fail later since the compressed data section must be
+       // composed of at least one symbol (the end-of-block marker).
        if max == 0 {
-               return false
+               return true
        }
 
        code := 0
@@ -258,7 +272,7 @@ type decompressor struct {
        h1, h2 huffmanDecoder
 
        // Length arrays used to define Huffman codes.
-       bits     *[maxLit + maxDist]int
+       bits     *[maxNumLit + maxNumDist]int
        codebits *[numCodes]int
 
        // Output history, buffer.
@@ -356,12 +370,14 @@ func (f *decompressor) readHuffman() error {
                }
        }
        nlit := int(f.b&0x1F) + 257
-       if nlit > maxLit {
+       if nlit > maxNumLit {
                return CorruptInputError(f.roffset)
        }
        f.b >>= 5
        ndist := int(f.b&0x1F) + 1
-       // maxDist is 32, so ndist is always valid.
+       if ndist > maxNumDist {
+               return CorruptInputError(f.roffset)
+       }
        f.b >>= 5
        nclen := int(f.b&0xF) + 4
        // numCodes is 19, so nclen is always valid.
@@ -492,9 +508,12 @@ func (f *decompressor) huffmanBlock() {
                case v < 285:
                        length = v*32 - (281*32 - 131)
                        n = 5
-               default:
+               case v < maxNumLit:
                        length = 258
                        n = 0
+               default:
+                       f.err = CorruptInputError(f.roffset)
+                       return
                }
                if n > 0 {
                        for f.nb < n {
@@ -529,10 +548,7 @@ func (f *decompressor) huffmanBlock() {
                switch {
                case dist < 4:
                        dist++
-               case dist >= 30:
-                       f.err = CorruptInputError(f.roffset)
-                       return
-               default:
+               case dist < maxNumDist:
                        nb := uint(dist-2) >> 1
                        // have 1 bit in bottom of dist, need nb more.
                        extra := (dist & 1) << nb
@@ -546,6 +562,9 @@ func (f *decompressor) huffmanBlock() {
                        f.b >>= nb
                        f.nb -= nb
                        dist = 1<<(nb+1) + 1 + extra
+               default:
+                       f.err = CorruptInputError(f.roffset)
+                       return
                }
 
                // Copy history[-dist:-dist+length] into output.
@@ -692,6 +711,10 @@ func (f *decompressor) moreBits() error {
 
 // Read the next Huffman-encoded symbol from f according to h.
 func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) {
+       // Since a huffmanDecoder can be empty or be composed of a degenerate tree
+       // with single element, huffSym must error on these two edge cases. In both
+       // cases, the chunks slice will be 0 for the invalid sequence, leading it
+       // satisfy the n == 0 check below.
        n := uint(h.min)
        for {
                for f.nb < n {
@@ -761,7 +784,7 @@ func (f *decompressor) Reset(r io.Reader, dict []byte) error {
 // The ReadCloser returned by NewReader also implements Resetter.
 func NewReader(r io.Reader) io.ReadCloser {
        var f decompressor
-       f.bits = new([maxLit + maxDist]int)
+       f.bits = new([maxNumLit + maxNumDist]int)
        f.codebits = new([numCodes]int)
        f.r = makeReader(r)
        f.hist = new([maxHist]byte)
@@ -780,7 +803,7 @@ func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser {
        var f decompressor
        f.r = makeReader(r)
        f.hist = new([maxHist]byte)
-       f.bits = new([maxLit + maxDist]int)
+       f.bits = new([maxNumLit + maxNumDist]int)
        f.codebits = new([numCodes]int)
        f.step = (*decompressor).nextBlock
        f.setDict(dict)