From 5cc8561333313c4318d980fea9b5c2dab596b503 Mon Sep 17 00:00:00 2001
From: Matthew Dempsky <mdempsky@google.com>
Date: Mon, 13 Apr 2015 15:31:28 -0700
Subject: [PATCH] compress/flate: reject invalid Huffman bit sizes

If the requested coding bit sizes don't result in a full binary tree,
then reject the input as invalid.

Exception: We still need to allow degenerate Huffman codings with a
single 1-bit code to be compatible with zlib and files compressed with
Go's compress/flate package.

Update #10426.

Change-Id: I171b98d12e65b4deb9f4031cd802407ebb5e266c
Reviewed-on: https://go-review.googlesource.com/8922
Reviewed-by: Nigel Tao <nigeltao@golang.org>
---
 src/compress/flate/flate_test.go |  75 ++++++++++++++++++--
 src/compress/flate/gen.go        | 113 ++++++++++++++++++++++++-------
 src/compress/flate/inflate.go    |  68 +++++++++----------
 3 files changed, 188 insertions(+), 68 deletions(-)

diff --git a/src/compress/flate/flate_test.go b/src/compress/flate/flate_test.go
index 5483641510..abf4784932 100644
--- a/src/compress/flate/flate_test.go
+++ b/src/compress/flate/flate_test.go
@@ -10,6 +10,8 @@ package flate
 
 import (
 	"bytes"
+	"encoding/hex"
+	"io/ioutil"
 	"testing"
 )
 
@@ -30,9 +32,8 @@ func TestIssue5915(t *testing.T) {
 	bits := []int{4, 0, 0, 6, 4, 3, 2, 3, 3, 4, 4, 5, 0, 0, 0, 0, 5, 5, 6,
 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 6, 0, 11, 0, 8, 0, 6, 6, 10, 8}
-	h := new(huffmanDecoder)
-	ok := h.init(bits)
-	if ok == true {
+	var h huffmanDecoder
+	if h.init(bits) {
 		t.Fatalf("Given sequence of bits is bad, and should not succeed.")
 	}
 }
@@ -41,9 +42,8 @@ func TestIssue5915(t *testing.T) {
 func TestIssue5962(t *testing.T) {
 	bits := []int{4, 0, 0, 6, 4, 3, 2, 3, 3, 4, 4, 5, 0, 0, 0, 0,
 		5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11}
-	h := new(huffmanDecoder)
-	ok := h.init(bits)
-	if ok == true {
+	var h huffmanDecoder
+	if h.init(bits) {
 		t.Fatalf("Given sequence of bits is bad, and should not succeed.")
 	}
 }
@@ -52,7 +52,7 @@ func TestIssue5962(t *testing.T) {
 func TestIssue6255(t *testing.T) {
 	bits1 := []int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11}
 	bits2 := []int{11, 13}
-	h := new(huffmanDecoder)
+	var h huffmanDecoder
 	if !h.init(bits1) {
 		t.Fatalf("Given sequence of bits is good and should succeed.")
 	}
@@ -77,3 +77,64 @@ func TestInvalidEncoding(t *testing.T) {
 		t.Fatal("Should have rejected invalid bit sequence")
 	}
 }
+
+func TestInvalidBits(t *testing.T) {
+	oversubscribed := []int{1, 2, 3, 4, 4, 5}
+	incomplete := []int{1, 2, 4, 4}
+	var h huffmanDecoder
+	if h.init(oversubscribed) {
+		t.Fatal("Should reject oversubscribed bit-length set")
+	}
+	if h.init(incomplete) {
+		t.Fatal("Should reject incomplete bit-length set")
+	}
+}
+
+func TestDegenerateHuffmanCoding(t *testing.T) {
+	// This test case is notable because:
+	//   1. It's decompressable by zlib.
+	//   2. It was generated by Go 1.4's compress/flate package.
+	//   3. It uses a degenerate dynamic Huffman coding block.
+	//
+	// The input is somewhat contrived though.  It's a sequence of
+	// 258 bytes with no 3+ byte sequence occuring more than once,
+	// except that the whole sequence is repeated twice.  This
+	// results in package flate emitting a single match token,
+	// which consequently means a single symbol in the distance
+	// coding table.
+	//
+	// Additionally, it uses very few unique byte values so that
+	// the overhead from storing the dynamic Huffman coding still
+	// results in a smaller encoding than using the fixed Huffman
+	// coding.
+	const (
+		originalHalf = "00013534215002452243512505010034133042401113" +
+			"400415101454022532410254513251155411055331124453555" +
+			"023120320201523334303524252551414033503012344230210" +
+			"310431305153005314321221315440455204052144332205422" +
+			"235434504441211420062622646656236416326065565261624" +
+			"6256136546"
+		compressedHex = "ecd081000030104251a5fad5f9a34d640a4f92b3144" +
+			"fa28366669a2ca54e542adba954cf7257c1422dd639ccde6a6b" +
+			"4b6cda659b885110f248d228a38ccd75954c91494b8415ab713" +
+			"42fd2e20683e3b5ea86aae13601ad40d6746a6bec221d07d7bb" +
+			"1db9fac2e9b61be7a3c7ceb9f5bec00b0000ffffecd08100003" +
+			"0104251a5fad5f9a34d640a4f92b3144fa28366669a2ca54e54" +
+			"2adba954cf7257c1422dd639ccde6a6b4b6cda659b885110f24" +
+			"8d228a38ccd75954c91494b8415ab71342fd2e20683e3b5ea86" +
+			"aae13601ad40d6746a6bec221d07d7bb1db9fac2e9b61be7a3c" +
+			"7ceb9f5bec00b0000ffff"
+	)
+
+	compressed, err := hex.DecodeString(compressedHex)
+	if err != nil {
+		t.Fatal(err)
+	}
+	data, err := ioutil.ReadAll(NewReader(bytes.NewReader(compressed)))
+	if err != nil {
+		t.Fatal(err)
+	}
+	if string(data) != originalHalf+originalHalf {
+		t.Fatal("Decompressed data does not match original")
+	}
+}
diff --git a/src/compress/flate/gen.go b/src/compress/flate/gen.go
index 6288ecddd0..eeafa84a5d 100644
--- a/src/compress/flate/gen.go
+++ b/src/compress/flate/gen.go
@@ -46,6 +46,15 @@ type huffmanDecoder struct {
 
 // Initialize Huffman decoding tables from array of code lengths.
 func (h *huffmanDecoder) init(bits []int) bool {
+	// Sanity enables additional runtime tests during Huffman
+	// table construction.  It's intended to be used during
+	// development to supplement the currently ad-hoc unit tests.
+	const sanity = false
+
+	if h.min != 0 {
+		*h = huffmanDecoder{}
+	}
+
 	// Count number of codes of each length,
 	// compute min and max length.
 	var count [maxCodeLen]int
@@ -66,33 +75,41 @@ func (h *huffmanDecoder) init(bits []int) bool {
 		return false
 	}
 
-	h.min = min
-	var linkBits uint
-	var numLinks int
-	if max > huffmanChunkBits {
-		linkBits = uint(max) - huffmanChunkBits
-		numLinks = 1 << linkBits
-		h.linkMask = uint32(numLinks - 1)
-	}
 	code := 0
 	var nextcode [maxCodeLen]int
 	for i := min; i <= max; i++ {
-		if i == huffmanChunkBits+1 {
-			// create link tables
-			link := code >> 1
-			h.links = make([][]uint32, huffmanNumChunks-link)
-			for j := uint(link); j < huffmanNumChunks; j++ {
-				reverse := int(reverseByte[j>>8]) | int(reverseByte[j&0xff])<<8
-				reverse >>= uint(16 - huffmanChunkBits)
-				off := j - uint(link)
-				h.chunks[reverse] = uint32(off<<huffmanValueShift + uint(i))
-				h.links[off] = make([]uint32, 1<<linkBits)
+		code <<= 1
+		nextcode[i] = code
+		code += count[i]
+	}
+
+	// Check that the coding is complete (i.e., that we've
+	// assigned all 2-to-the-max possible bit sequences).
+	// Exception: To be compatible with zlib, we also need to
+	// accept degenerate single-code codings.  See also
+	// TestDegenerateHuffmanCoding.
+	if code != 1<<uint(max) && !(code == 1 && max == 1) {
+		return false
+	}
+
+	h.min = min
+	if max > huffmanChunkBits {
+		numLinks := 1 << (uint(max) - huffmanChunkBits)
+		h.linkMask = uint32(numLinks - 1)
+
+		// create link tables
+		link := nextcode[huffmanChunkBits+1] >> 1
+		h.links = make([][]uint32, huffmanNumChunks-link)
+		for j := uint(link); j < huffmanNumChunks; j++ {
+			reverse := int(reverseByte[j>>8]) | int(reverseByte[j&0xff])<<8
+			reverse >>= uint(16 - huffmanChunkBits)
+			off := j - uint(link)
+			if sanity && h.chunks[reverse] != 0 {
+				panic("impossible: overwriting existing chunk")
 			}
+			h.chunks[reverse] = uint32(off<<huffmanValueShift | (huffmanChunkBits + 1))
+			h.links[off] = make([]uint32, numLinks)
 		}
-		n := count[i]
-		nextcode[i] = code
-		code += n
-		code <<= 1
 	}
 
 	for i, n := range bits {
@@ -105,17 +122,60 @@ func (h *huffmanDecoder) init(bits []int) bool {
 		reverse := int(reverseByte[code>>8]) | int(reverseByte[code&0xff])<<8
 		reverse >>= uint(16 - n)
 		if n <= huffmanChunkBits {
-			for off := reverse; off < huffmanNumChunks; off += 1 << uint(n) {
+			for off := reverse; off < len(h.chunks); off += 1 << uint(n) {
+				// We should never need to overwrite
+				// an existing chunk.  Also, 0 is
+				// never a valid chunk, because the
+				// lower 4 "count" bits should be
+				// between 1 and 15.
+				if sanity && h.chunks[off] != 0 {
+					panic("impossible: overwriting existing chunk")
+				}
 				h.chunks[off] = chunk
 			}
 		} else {
-			linktab := h.links[h.chunks[reverse&(huffmanNumChunks-1)]>>huffmanValueShift]
+			j := reverse & (huffmanNumChunks - 1)
+			if sanity && h.chunks[j]&huffmanCountMask != huffmanChunkBits+1 {
+				// Longer codes should have been
+				// associated with a link table above.
+				panic("impossible: not an indirect chunk")
+			}
+			value := h.chunks[j] >> huffmanValueShift
+			linktab := h.links[value]
 			reverse >>= huffmanChunkBits
-			for off := reverse; off < numLinks; off += 1 << uint(n-huffmanChunkBits) {
+			for off := reverse; off < len(linktab); off += 1 << uint(n-huffmanChunkBits) {
+				if sanity && linktab[off] != 0 {
+					panic("impossible: overwriting existing chunk")
+				}
 				linktab[off] = chunk
 			}
 		}
 	}
+
+	if sanity {
+		// Above we've sanity checked that we never overwrote
+		// an existing entry.  Here we additionally check that
+		// we filled the tables completely.
+		for i, chunk := range h.chunks {
+			if chunk == 0 {
+				// As an exception, in the degenerate
+				// single-code case, we allow odd
+				// chunks to be missing.
+				if code == 1 && i%2 == 1 {
+					continue
+				}
+				panic("impossible: missing chunk")
+			}
+		}
+		for _, linktab := range h.links {
+			for _, chunk := range linktab {
+				if chunk == 0 {
+					panic("impossible: missing chunk")
+				}
+			}
+		}
+	}
+
 	return true
 }
 
@@ -138,6 +198,9 @@ func main() {
 		bits[i] = 8
 	}
 	h.init(bits[:])
+	if h.links != nil {
+		log.Fatal("Unexpected links table in fixed Huffman decoder")
+	}
 
 	var buf bytes.Buffer
 
diff --git a/src/compress/flate/inflate.go b/src/compress/flate/inflate.go
index 291e62343e..6f88159dfa 100644
--- a/src/compress/flate/inflate.go
+++ b/src/compress/flate/inflate.go
@@ -105,9 +105,6 @@ func (h *huffmanDecoder) init(bits []int) bool {
 	// Sanity enables additional runtime tests during Huffman
 	// table construction.  It's intended to be used during
 	// development to supplement the currently ad-hoc unit tests.
-	//
-	// TODO(mdempsky): TestIssue5962 and TestIssue6255 currently
-	// fail with these enabled.
 	const sanity = false
 
 	if h.min != 0 {
@@ -134,39 +131,41 @@ func (h *huffmanDecoder) init(bits []int) bool {
 		return false
 	}
 
-	h.min = min
-	var linkBits uint
-	var numLinks int
-	if max > huffmanChunkBits {
-		linkBits = uint(max) - huffmanChunkBits
-		numLinks = 1 << linkBits
-		h.linkMask = uint32(numLinks - 1)
-	}
 	code := 0
 	var nextcode [maxCodeLen]int
 	for i := min; i <= max; i++ {
-		if i == huffmanChunkBits+1 {
-			// create link tables
-			link := code >> 1
-			if huffmanNumChunks < link {
-				return false
-			}
-			h.links = make([][]uint32, huffmanNumChunks-link)
-			for j := uint(link); j < huffmanNumChunks; j++ {
-				reverse := int(reverseByte[j>>8]) | int(reverseByte[j&0xff])<<8
-				reverse >>= uint(16 - huffmanChunkBits)
-				off := j - uint(link)
-				if sanity && h.chunks[reverse] != 0 {
-					panic("impossible: overwriting existing chunk")
-				}
-				h.chunks[reverse] = uint32(off<<huffmanValueShift + uint(i))
-				h.links[off] = make([]uint32, 1<<linkBits)
+		code <<= 1
+		nextcode[i] = code
+		code += count[i]
+	}
+
+	// Check that the coding is complete (i.e., that we've
+	// assigned all 2-to-the-max possible bit sequences).
+	// Exception: To be compatible with zlib, we also need to
+	// accept degenerate single-code codings.  See also
+	// TestDegenerateHuffmanCoding.
+	if code != 1<<uint(max) && !(code == 1 && max == 1) {
+		return false
+	}
+
+	h.min = min
+	if max > huffmanChunkBits {
+		numLinks := 1 << (uint(max) - huffmanChunkBits)
+		h.linkMask = uint32(numLinks - 1)
+
+		// create link tables
+		link := nextcode[huffmanChunkBits+1] >> 1
+		h.links = make([][]uint32, huffmanNumChunks-link)
+		for j := uint(link); j < huffmanNumChunks; j++ {
+			reverse := int(reverseByte[j>>8]) | int(reverseByte[j&0xff])<<8
+			reverse >>= uint(16 - huffmanChunkBits)
+			off := j - uint(link)
+			if sanity && h.chunks[reverse] != 0 {
+				panic("impossible: overwriting existing chunk")
 			}
+			h.chunks[reverse] = uint32(off<<huffmanValueShift | (huffmanChunkBits + 1))
+			h.links[off] = make([]uint32, numLinks)
 		}
-		n := count[i]
-		nextcode[i] = code
-		code += n
-		code <<= 1
 	}
 
 	for i, n := range bits {
@@ -179,7 +178,7 @@ func (h *huffmanDecoder) init(bits []int) bool {
 		reverse := int(reverseByte[code>>8]) | int(reverseByte[code&0xff])<<8
 		reverse >>= uint(16 - n)
 		if n <= huffmanChunkBits {
-			for off := reverse; off < huffmanNumChunks; off += 1 << uint(n) {
+			for off := reverse; off < len(h.chunks); off += 1 << uint(n) {
 				// We should never need to overwrite
 				// an existing chunk.  Also, 0 is
 				// never a valid chunk, because the
@@ -198,12 +197,9 @@ func (h *huffmanDecoder) init(bits []int) bool {
 				panic("impossible: not an indirect chunk")
 			}
 			value := h.chunks[j] >> huffmanValueShift
-			if value >= uint32(len(h.links)) {
-				return false
-			}
 			linktab := h.links[value]
 			reverse >>= huffmanChunkBits
-			for off := reverse; off < numLinks; off += 1 << uint(n-huffmanChunkBits) {
+			for off := reverse; off < len(linktab); off += 1 << uint(n-huffmanChunkBits) {
 				if sanity && linktab[off] != 0 {
 					panic("impossible: overwriting existing chunk")
 				}
-- 
2.50.0