From 53efe1e121e68feadcce00d1f7b1978dbbd1d1c7 Mon Sep 17 00:00:00 2001
From: Klaus Post
Date: Mon, 21 Mar 2016 10:21:55 +0100
Subject: [PATCH] compress/flate: rework matching algorithm
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This changes how matching is done in the deflate algorithm.

The major change is that we no longer look for matches that are only
3 bytes in length; matches must be at least 4 bytes. Contrary to what
you would expect, this actually improves the compression ratio, since
3 literal bytes will often be shorter than a match after Huffman
encoding. This varies a bit by source, but it is most often the case
when the source is "easy" to compress.

Second, a "stronger" hash is used. The hash is similar to the hashing
function used by Snappy.

Overall, the speed impact is biggest on higher compression levels.
I intend to replace the "speed" compression level; the replacement
can be seen in CL 21021.

The built-in benchmark using "digits" is slower at level 1. I see this
as an exception, since "digits" is a special type of data with low
entropy (numbers 0->9) but no significant matches. Again, CL 21021
fixes that case.

NewWriterDict is also made considerably faster by not running the data
through the entire encoder. This is not reflected by the benchmark.

COMPARED to tip/master:

name                       old time/op    new time/op    delta
EncodeDigitsSpeed1e4-4        401µs ± 1%     345µs ± 2%  -13.95%
EncodeDigitsSpeed1e5-4       3.19ms ± 1%    4.27ms ± 3%  +33.96%
EncodeDigitsSpeed1e6-4       27.7ms ± 4%    43.8ms ± 3%  +58.00%
EncodeDigitsDefault1e4-4      641µs ± 0%     403µs ± 1%  -37.15%
EncodeDigitsDefault1e5-4     13.8ms ± 1%     6.4ms ± 3%  -53.73%
EncodeDigitsDefault1e6-4      162ms ± 1%      64ms ± 2%  -60.51%
EncodeDigitsCompress1e4-4     627µs ± 1%     405µs ± 2%  -35.45%
EncodeDigitsCompress1e5-4    13.9ms ± 0%     6.3ms ± 2%  -54.46%
EncodeDigitsCompress1e6-4     159ms ± 1%      64ms ± 0%  -59.91%
EncodeTwainSpeed1e4-4         433µs ± 4%     331µs ± 1%  -23.53%
EncodeTwainSpeed1e5-4        2.82ms ± 1%    3.08ms ± 0%   +9.10%
EncodeTwainSpeed1e6-4        28.1ms ± 2%    28.8ms ± 0%   +2.82%
EncodeTwainDefault1e4-4       695µs ± 4%     474µs ± 1%  -31.78%
EncodeTwainDefault1e5-4      11.8ms ± 0%     7.4ms ± 0%  -37.31%
EncodeTwainDefault1e6-4       128ms ± 0%      75ms ± 0%  -40.93%
EncodeTwainCompress1e4-4      719µs ± 3%     480µs ± 0%  -33.27%
EncodeTwainCompress1e5-4     15.0ms ± 3%     8.2ms ± 2%  -45.55%
EncodeTwainCompress1e6-4      170ms ± 0%      85ms ± 1%  -49.99%

name                       old speed      new speed       delta
EncodeDigitsSpeed1e4-4     25.0MB/s ± 1%  29.0MB/s ± 2%   +16.24%
EncodeDigitsSpeed1e5-4     31.4MB/s ± 1%  23.4MB/s ± 3%   -25.34%
EncodeDigitsSpeed1e6-4     36.1MB/s ± 4%  22.8MB/s ± 3%   -36.74%
EncodeDigitsDefault1e4-4   15.6MB/s ± 0%  24.8MB/s ± 1%   +59.11%
EncodeDigitsDefault1e5-4   7.27MB/s ± 1%  15.72MB/s ± 3%  +116.23%
EncodeDigitsDefault1e6-4   6.16MB/s ± 0%  15.60MB/s ± 2%  +153.25%
EncodeDigitsCompress1e4-4  15.9MB/s ± 1%  24.7MB/s ± 2%   +54.97%
EncodeDigitsCompress1e5-4  7.19MB/s ± 0%  15.78MB/s ± 2%  +119.62%
EncodeDigitsCompress1e6-4  6.27MB/s ± 1%  15.65MB/s ± 0%  +149.52%
EncodeTwainSpeed1e4-4      23.1MB/s ± 4%  30.2MB/s ± 1%   +30.68%
EncodeTwainSpeed1e5-4      35.4MB/s ± 1%  32.5MB/s ± 0%    -8.34%
EncodeTwainSpeed1e6-4      35.6MB/s ± 2%  34.7MB/s ± 0%    -2.77%
EncodeTwainDefault1e4-4    14.4MB/s ± 4%  21.1MB/s ± 1%   +46.48%
EncodeTwainDefault1e5-4    8.49MB/s ± 0%  13.55MB/s ± 0%   +59.50%
EncodeTwainDefault1e6-4    7.83MB/s ± 0%  13.25MB/s ± 0%   +69.19%
EncodeTwainCompress1e4-4   13.9MB/s ± 3%  20.8MB/s ± 0%   +49.83%
EncodeTwainCompress1e5-4   6.65MB/s ± 3%  12.20MB/s ± 2%   +83.51%
EncodeTwainCompress1e6-4   5.88MB/s ± 0%  11.76MB/s ± 1%  +100.06%
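For illustration only (not part of the patch): a minimal, self-contained
sketch of the Snappy-style hash mentioned above. The constants mirror the
hashBits value and the multiplier that the diff below introduces as hash4;
the surrounding main is assumed scaffolding.

    package main

    import "fmt"

    const (
        hashBits = 17         // matches hashBits in the diff below
        hashmul  = 0x1e35a7bd // the Snappy-style odd multiplier from the diff
    )

    // hash4 mirrors the helper this patch adds: load 4 bytes big-endian,
    // multiply by an odd constant, and keep only the top hashBits bits.
    // The multiplication mixes all four bytes into the high bits, unlike
    // the old shift-and-add hash over 3 bytes.
    func hash4(b []byte) uint32 {
        return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits)
    }

    func main() {
        fmt.Printf("%#x\n", hash4([]byte("abcd")))
        fmt.Printf("%#x\n", hash4([]byte("abce"))) // one differing byte -> very different bucket
    }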
Change-Id: I724e33c1dd3e3a6a1b0a68e094baa959352baf32
Reviewed-on: https://go-review.googlesource.com/20929
Run-TryBot: Nigel Tao
Reviewed-by: Nigel Tao
---
 src/compress/flate/deflate.go      | 237 ++++++++++++++++++-----------
 src/compress/flate/deflate_test.go |  63 +++++++-
 2 files changed, 210 insertions(+), 90 deletions(-)

diff --git a/src/compress/flate/deflate.go b/src/compress/flate/deflate.go
index 199fc4cf3c..428f2508d3 100644
--- a/src/compress/flate/deflate.go
+++ b/src/compress/flate/deflate.go
@@ -13,14 +13,13 @@ import (
 
 const (
 	NoCompression      = 0
 	BestSpeed          = 1
-	fastCompression    = 3
 	BestCompression    = 9
 	DefaultCompression = -1
 	logWindowSize      = 15
 	windowSize         = 1 << logWindowSize
 	windowMask         = windowSize - 1
 	logMaxOffsetSize   = 15  // Standard DEFLATE
-	minMatchLength     = 3   // The smallest match that the compressor looks for
+	minMatchLength     = 4   // The smallest match that the compressor looks for
 	maxMatchLength     = 258 // The longest match for the compressor
 	minOffsetSize      = 1   // The shortest offset that makes any sense
@@ -28,39 +27,39 @@ const (
 	// stop things from getting too large.
 	maxFlateBlockTokens = 1 << 14
 	maxStoreBlockSize   = 65535
-	hashBits            = 17
+	hashBits            = 17 // After 17 performance degrades
 	hashSize            = 1 << hashBits
 	hashMask            = (1 << hashBits) - 1
-	hashShift           = (hashBits + minMatchLength - 1) / minMatchLength
 	maxHashOffset       = 1 << 24
 
 	skipNever = math.MaxInt32
 )
 
 type compressionLevel struct {
-	good, lazy, nice, chain, fastSkipHashing int
+	level, good, lazy, nice, chain, fastSkipHashing int
 }
 
 var levels = []compressionLevel{
 	{}, // 0
 	// For levels 1-3 we don't bother trying with lazy matches
-	{3, 0, 8, 4, 4},
-	{3, 0, 16, 8, 5},
-	{3, 0, 32, 32, 6},
+	{1, 4, 0, 8, 4, 4},
+	{2, 4, 0, 16, 8, 5},
+	{3, 4, 0, 32, 32, 6},
 	// Levels 4-9 use increasingly more lazy matching
 	// and increasingly stringent conditions for "good enough".
-	{4, 4, 16, 16, skipNever},
-	{8, 16, 32, 32, skipNever},
-	{8, 16, 128, 128, skipNever},
-	{8, 32, 128, 256, skipNever},
-	{32, 128, 258, 1024, skipNever},
-	{32, 258, 258, 4096, skipNever},
+	{4, 4, 4, 16, 16, skipNever},
+	{5, 8, 16, 32, 32, skipNever},
+	{6, 8, 16, 128, 128, skipNever},
+	{7, 8, 32, 128, 256, skipNever},
+	{8, 32, 128, 258, 1024, skipNever},
+	{9, 32, 258, 258, 4096, skipNever},
 }
 
 type compressor struct {
 	compressionLevel
 
-	w *huffmanBitWriter
+	w          *huffmanBitWriter
+	bulkHasher func([]byte, []uint32)
 
 	// compression algorithm
 	fill func(*compressor, []byte) int // copy data to window
@@ -73,8 +72,8 @@ type compressor struct {
 	// hashPrev[hashHead[hashValue] & windowMask] contains the previous index
 	// with the same hash value.
 	chainHead  int
-	hashHead   []int
-	hashPrev   []int
+	hashHead   []uint32
+	hashPrev   []uint32
 	hashOffset int
 
 	// input window: unprocessed data is window[index:windowEnd]
@@ -90,9 +89,12 @@ type compressor struct {
 	// deflate state
 	length         int
 	offset         int
-	hash           int
+	hash           uint32
 	maxInsertIndex int
 	err            error
+
+	// hashMatch must be able to contain hashes for the maximum match length.
+	hashMatch [maxMatchLength - 1]uint32
 }
 
 func (d *compressor) fillDeflate(b []byte) int {
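As orientation for the hunks that follow: hashHead and hashPrev together
form per-hash chains of candidate positions, newest first. A simplified,
self-contained sketch of the idea (assumptions: window constants are
hard-coded, and the hashOffset rebasing that the real code layers on top
is omitted; position 0 doubles as the empty sentinel, which is why the
real tables store positions offset by hashOffset >= 1):

    package main

    import "fmt"

    const (
        hashBits   = 17
        windowSize = 1 << 15
        windowMask = windowSize - 1
    )

    // chain walks the candidate positions recorded for hash h, newest
    // first. hashHead[h] is the most recent position whose 4-byte window
    // hashed to h; hashPrev[p&windowMask] is the position before p with
    // the same hash. Position 0 acts as the "no entry" sentinel here.
    func chain(hashHead, hashPrev []uint32, h uint32, maxChain int) []int {
        var out []int
        for i := int(hashHead[h]); i > 0 && maxChain > 0; maxChain-- {
            out = append(out, i)
            i = int(hashPrev[i&windowMask])
        }
        return out
    }

    func main() {
        hashHead := make([]uint32, 1<<hashBits)
        hashPrev := make([]uint32, windowSize)

        // Record positions 5, 42 and 100 as hashing to the same bucket 7.
        for _, p := range []int{5, 42, 100} {
            hashPrev[p&windowMask] = hashHead[7]
            hashHead[7] = uint32(p)
        }
        fmt.Println(chain(hashHead, hashPrev, 7, 8)) // [100 42 5]
    }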
@@ -112,15 +114,15 @@ func (d *compressor) fillDeflate(b []byte) int {
 		d.hashOffset -= delta
 		d.chainHead -= delta
 		for i, v := range d.hashPrev {
-			if v > delta {
-				d.hashPrev[i] -= delta
+			if int(v) > delta {
+				d.hashPrev[i] = uint32(int(v) - delta)
 			} else {
 				d.hashPrev[i] = 0
 			}
 		}
 		for i, v := range d.hashHead {
-			if v > delta {
-				d.hashHead[i] -= delta
+			if int(v) > delta {
+				d.hashHead[i] = uint32(int(v) - delta)
 			} else {
 				d.hashHead[i] = 0
 			}
 		}
@@ -132,19 +134,73 @@ func (d *compressor) fillDeflate(b []byte) int {
 	return n
 }
 
-func (d *compressor) writeBlock(tokens []token, index int, eof bool) error {
-	if index > 0 || eof {
+func (d *compressor) writeBlock(tokens []token, index int) error {
+	if index > 0 {
 		var window []byte
 		if d.blockStart <= index {
 			window = d.window[d.blockStart:index]
 		}
 		d.blockStart = index
-		d.w.writeBlock(tokens, eof, window)
+		d.w.writeBlock(tokens, false, window)
 		return d.w.err
 	}
 	return nil
 }
 
+// fillWindow will fill the current window with the supplied
+// dictionary and calculate all hashes.
+// This is much faster than doing a full encode.
+// Should only be used after a reset.
+func (d *compressor) fillWindow(b []byte) {
+	// Do not fill window if we are in store-only mode.
+	if d.compressionLevel.level == 0 {
+		return
+	}
+	if d.index != 0 || d.windowEnd != 0 {
+		panic("internal error: fillWindow called with stale data")
+	}
+
+	// If we are given too much, cut it.
+	if len(b) > windowSize {
+		b = b[len(b)-windowSize:]
+	}
+	// Add all to window.
+	n := copy(d.window, b)
+
+	// Calculate 256 hashes at the time (more L1 cache hits)
+	loops := (n + 256 - minMatchLength) / 256
+	for j := 0; j < loops; j++ {
+		index := j * 256
+		end := index + 256 + minMatchLength - 1
+		if end > n {
+			end = n
+		}
+		toCheck := d.window[index:end]
+		dstSize := len(toCheck) - minMatchLength + 1
+
+		if dstSize <= 0 {
+			continue
+		}
+
+		dst := d.hashMatch[:dstSize]
+		d.bulkHasher(toCheck, dst)
+		var newH uint32
+		for i, val := range dst {
+			di := i + index
+			newH = val & hashMask
+			// Get previous value with the same hash.
+			// Our chain should point to the previous value.
+			d.hashPrev[di&windowMask] = d.hashHead[newH]
+			// Set the head of the hash chain to us.
+			d.hashHead[newH] = uint32(di + d.hashOffset)
+		}
+		d.hash = newH
+	}
+	// Update window information.
+	d.windowEnd = n
+	d.index = n
+}
+
 // Try to find a match starting at index whose length is greater than prevSize.
 // We only look at chainCount possibilities before giving up.
 func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
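The slab arithmetic in fillWindow above is easier to see with concrete
numbers. A standalone sketch (the 600-byte input size is an arbitrary
assumption): slabs overlap by minMatchLength-1 bytes, so every 4-byte
window is hashed exactly once while the working set stays small enough
for L1 cache.

    package main

    import "fmt"

    const minMatchLength = 4 // matches the new constant in deflate.go

    func main() {
        n := 600 // assume 600 bytes were copied into the window
        loops := (n + 256 - minMatchLength) / 256
        total := 0
        for j := 0; j < loops; j++ {
            index := j * 256
            end := index + 256 + minMatchLength - 1
            if end > n {
                end = n
            }
            hashes := end - index - minMatchLength + 1
            total += hashes
            fmt.Printf("slab %d: window[%d:%d], %d hashes\n", j, index, end, hashes)
        }
        fmt.Println(total == n-minMatchLength+1) // true: every position hashed once
    }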
@@ -168,20 +224,15 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead
 		tries >>= 2
 	}
 
-	w0 := win[pos]
-	w1 := win[pos+1]
 	wEnd := win[pos+length]
+	wPos := win[pos:]
 	minIndex := pos - windowSize
 
 	for i := prevHead; tries > 0; tries-- {
-		if w0 == win[i] && w1 == win[i+1] && wEnd == win[i+length] {
-			// The hash function ensures that if win[i] and win[i+1] match, win[i+2] matches
+		if wEnd == win[i+length] {
+			n := matchLen(win[i:], wPos, minMatchLook)
 
-			n := 3
-			for pos+n < len(win) && win[i+n] == win[pos+n] {
-				n++
-			}
-			if n > length && (n > 3 || pos-i <= 4096) {
+			if n > length && (n > minMatchLength || pos-i <= 4096) {
 				length = n
 				offset = pos - i
 				ok = true
@@ -196,7 +247,8 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead
 			// hashPrev[i & windowMask] has already been overwritten, so stop now.
 			break
 		}
-		if i = d.hashPrev[i&windowMask] - d.hashOffset; i < minIndex || i < 0 {
+		i = int(d.hashPrev[i&windowMask]) - d.hashOffset
+		if i < minIndex || i < 0 {
 			break
 		}
 	}
@@ -211,9 +263,46 @@ func (d *compressor) writeStoredBlock(buf []byte) error {
 	return d.w.err
 }
 
+const hashmul = 0x1e35a7bd
+
+// hash4 returns a hash representation of the first 4 bytes
+// of the supplied slice.
+// The caller must ensure that len(b) >= 4.
+func hash4(b []byte) uint32 {
+	return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits)
+}
+
+// bulkHash4 will compute hashes using the same
+// algorithm as hash4
+func bulkHash4(b []byte, dst []uint32) {
+	if len(b) < minMatchLength {
+		return
+	}
+	hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
+	dst[0] = (hb * hashmul) >> (32 - hashBits)
+	end := len(b) - minMatchLength + 1
+	for i := 1; i < end; i++ {
+		hb = (hb << 8) | uint32(b[i+3])
+		dst[i] = (hb * hashmul) >> (32 - hashBits)
+	}
+}
+
+// matchLen returns the number of matching bytes in a and b
+// up to length 'max'. Both slices must be at least 'max'
+// bytes in size.
+func matchLen(a, b []byte, max int) int {
+	a = a[:max]
+	for i, av := range a {
+		if b[i] != av {
+			return i
+		}
+	}
+	return max
+}
+
 func (d *compressor) initDeflate() {
-	d.hashHead = make([]int, hashSize)
-	d.hashPrev = make([]int, windowSize)
+	d.hashHead = make([]uint32, hashSize)
+	d.hashPrev = make([]uint32, windowSize)
 	d.window = make([]byte, 2*windowSize)
 	d.hashOffset = 1
 	d.tokens = make([]token, 0, maxFlateBlockTokens+1)
@@ -223,6 +312,7 @@ func (d *compressor) initDeflate() {
 	d.index = 0
 	d.hash = 0
 	d.chainHead = -1
+	d.bulkHasher = bulkHash4
 }
 
 func (d *compressor) deflate() {
@@ -232,7 +322,7 @@ func (d *compressor) deflate() {
 
 	d.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
 	if d.index < d.maxInsertIndex {
-		d.hash = int(d.window[d.index])<<hashShift + int(d.window[d.index+1])
+		d.hash = hash4(d.window[d.index : d.index+minMatchLength])
 	}
 
 Loop:
@@ -254,7 +344,7 @@ Loop:
 					d.byteAvailable = false
 				}
 				if len(d.tokens) > 0 {
-					if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+					if d.err = d.writeBlock(d.tokens, d.index); d.err != nil {
 						return
 					}
 					d.tokens = d.tokens[:0]
@@ -266,10 +356,10 @@ Loop:
 		}
 		if d.index < d.maxInsertIndex {
 			// Update the hash
-			d.hash = (d.hash<<hashShift + int(d.window[d.index+2])) & hashMask
-			d.chainHead = d.hashHead[d.hash]
-			d.hashPrev[d.index&windowMask] = d.chainHead
-			d.hashHead[d.hash] = d.index + d.hashOffset
+			d.hash = hash4(d.window[d.index : d.index+minMatchLength])
+			d.chainHead = int(d.hashHead[d.hash])
+			d.hashPrev[d.index&windowMask] = uint32(d.chainHead)
+			d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
 		}
 		prevLength := d.length
 		prevOffset := d.offset
@@ -288,11 +378,11 @@ Loop:
 				}
 				for d.index++; d.index < newIndex; d.index++ {
 					if d.index < d.maxInsertIndex {
-						d.hash = (d.hash<<hashShift + int(d.window[d.index+2])) & hashMask
+						d.hash = hash4(d.window[d.index : d.index+minMatchLength])
 						// Get previous value with the same hash.
 						// Our chain should point to the previous value.
 						d.hashPrev[d.index&windowMask] = d.hashHead[d.hash]
 						// Set the head of the hash chain to us.
-						d.hashHead[d.hash] = d.index + d.hashOffset
+						d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
 					}
 				}
@@ -304,7 +394,7 @@ Loop:
 			} else {
 				// For matches this long, we don't bother inserting
 				// each individual item into the table.
 				d.index += d.length
 				if d.index < d.maxInsertIndex {
-					d.hash = int(d.window[d.index])<<hashShift + int(d.window[d.index+1])
+					d.hash = hash4(d.window[d.index : d.index+minMatchLength])
 				}
@@ -315,7 +405,7 @@ Loop:
 			}
 			if len(d.tokens) == maxFlateBlockTokens {
 				// The block includes the current character
-				if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+				if d.err = d.writeBlock(d.tokens, d.index); d.err != nil {
 					return
 				}
 				d.tokens = d.tokens[:0]
@@ -330,7 +420,7 @@ Loop:
 				d.tokens = append(d.tokens, literalToken(uint32(d.window[i])))
 				if len(d.tokens) == maxFlateBlockTokens {
-					if d.err = d.writeBlock(d.tokens, i+1, false); d.err != nil {
+					if d.err = d.writeBlock(d.tokens, i+1); d.err != nil {
 						return
 					}
 					d.tokens = d.tokens[:0]
 				}
@@ -397,7 +487,6 @@ func (d *compressor) write(b []byte) (n int, err error) {
 		return 0, d.err
 	}
 	n = len(b)
-	b = b[d.fill(d, b):]
 	for len(b) > 0 {
 		d.step(d)
 		b = b[d.fill(d, b):]
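The rewritten findMatch rejects most candidates with a single byte
comparison before running matchLen: a candidate can only beat the best
match so far if it also agrees at offset `length`. A standalone sketch
(matchLen is copied from the patch; the window contents, positions and
the pre-existing best length of 4 are made-up values):

    package main

    import "fmt"

    // matchLen is copied from the patch: the number of matching bytes in
    // a and b, up to max. Both slices must be at least max bytes long.
    func matchLen(a, b []byte, max int) int {
        a = a[:max]
        for i, av := range a {
            if b[i] != av {
                return i
            }
        }
        return max
    }

    func main() {
        win := []byte("abcde_abcxy_abcdefg")
        pos, best := 12, 4 // matching starts at "abcdefg"; best match so far is 4 bytes

        for _, i := range []int{0, 6} { // candidate positions from the hash chain
            // One-byte filter: only scan if the candidate matches at
            // offset `best`, i.e. it could improve on the current best.
            if win[i+best] != win[pos+best] {
                fmt.Printf("candidate %d: rejected without scanning\n", i)
                continue
            }
            n := matchLen(win[i:], win[pos:], len(win)-pos)
            fmt.Printf("candidate %d: match of length %d\n", i, n)
        }
    }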
@@ -425,42 +514,29 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 	return nil
 }
 
-var zeroes [32]int
-var bzeroes [256]byte
+// hzeroes is used for zeroing the hash slice.
+var hzeroes [256]uint32
 
 func (d *compressor) reset(w io.Writer) {
 	d.w.reset(w)
 	d.sync = false
 	d.err = nil
-	switch d.compressionLevel.chain {
-	case 0:
-		// level was NoCompression.
-		for i := range d.window {
-			d.window[i] = 0
-		}
+	switch d.compressionLevel.level {
+	case NoCompression:
 		d.windowEnd = 0
 	default:
 		d.chainHead = -1
 		for s := d.hashHead; len(s) > 0; {
-			n := copy(s, zeroes[:])
+			n := copy(s, hzeroes[:])
 			s = s[n:]
 		}
-		for s := d.hashPrev; len(s) > 0; s = s[len(zeroes):] {
-			copy(s, zeroes[:])
+		for s := d.hashPrev; len(s) > 0; s = s[len(hzeroes):] {
+			copy(s, hzeroes[:])
 		}
 		d.hashOffset = 1
-
 		d.index, d.windowEnd = 0, 0
-		for s := d.window; len(s) > 0; {
-			n := copy(s, bzeroes[:])
-			s = s[n:]
-		}
 		d.blockStart, d.byteAvailable = 0, false
-
-		d.tokens = d.tokens[:maxFlateBlockTokens+1]
-		for i := 0; i <= maxFlateBlockTokens; i++ {
-			d.tokens[i] = 0
-		}
 		d.tokens = d.tokens[:0]
 		d.length = minMatchLength - 1
 		d.offset = 0
@@ -509,28 +585,22 @@ func NewWriter(w io.Writer, level int) (*Writer, error) {
 // can only be decompressed by a Reader initialized with the
 // same dictionary.
 func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) {
-	dw := &dictWriter{w, false}
+	dw := &dictWriter{w}
 	zw, err := NewWriter(dw, level)
 	if err != nil {
 		return nil, err
 	}
-	zw.Write(dict)
-	zw.Flush()
-	dw.enabled = true
+	zw.d.fillWindow(dict)
 	zw.dict = append(zw.dict, dict...) // duplicate dictionary for Reset method.
 	return zw, err
 }
 
 type dictWriter struct {
-	w       io.Writer
-	enabled bool
+	w io.Writer
 }
 
 func (w *dictWriter) Write(b []byte) (n int, err error) {
-	if w.enabled {
-		return w.w.Write(b)
-	}
-	return len(b), nil
+	return w.w.Write(b)
 }
 
 // A Writer takes data written to it and writes the compressed
@@ -572,10 +642,7 @@ func (w *Writer) Reset(dst io.Writer) {
 		// w was created with NewWriterDict
 		dw.w = dst
 		w.d.reset(dw)
-		dw.enabled = false
-		w.Write(w.dict)
-		w.Flush()
-		dw.enabled = true
+		w.d.fillWindow(w.dict)
 	} else {
 		// w was created with NewWriter
 		w.d.reset(dst)
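For reference, the NewWriterDict fast path above changes no public
behavior: the dictionary is now preloaded via fillWindow instead of
being compressed and discarded, and a writer primed with a dictionary
still pairs with NewReaderDict in the usual way. A small usage sketch
(the sample dictionary and payload are made up):

    package main

    import (
        "bytes"
        "compress/flate"
        "fmt"
        "io/ioutil"
        "log"
    )

    func main() {
        dict := []byte("hello world")
        var buf bytes.Buffer

        zw, err := flate.NewWriterDict(&buf, flate.BestCompression, dict)
        if err != nil {
            log.Fatal(err)
        }
        zw.Write([]byte("hello world, hello world"))
        zw.Close()

        // The reader must be primed with the same dictionary.
        zr := flate.NewReaderDict(&buf, dict)
        out, err := ioutil.ReadAll(zr)
        if err != nil {
            log.Fatal(err)
        }
        zr.Close()
        fmt.Printf("%s\n", out) // hello world, hello world
    }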
diff --git a/src/compress/flate/deflate_test.go b/src/compress/flate/deflate_test.go
index c165ade734..6201213f01 100644
--- a/src/compress/flate/deflate_test.go
+++ b/src/compress/flate/deflate_test.go
@@ -80,6 +80,32 @@ func largeDataChunk() []byte {
 	return result
 }
 
+func TestBulkHash4(t *testing.T) {
+	for _, x := range deflateTests {
+		y := x.out
+		if len(y) < minMatchLength {
+			continue
+		}
+		y = append(y, y...)
+		for j := 4; j < len(y); j++ {
+			y := y[:j]
+			dst := make([]uint32, len(y)-minMatchLength+1)
+			for i := range dst {
+				dst[i] = uint32(i + 100)
+			}
+			bulkHash4(y, dst)
+			for i, got := range dst {
+				want := hash4(y[i:])
+				if got != want && got == uint32(i)+100 {
+					t.Errorf("Len:%d Index:%d, want 0x%08x but not modified", len(y), i, want)
+				} else if got != want {
+					t.Errorf("Len:%d Index:%d, got 0x%08x want:0x%08x", len(y), i, got, want)
+				}
+			}
+		}
+	}
+}
+
 func TestDeflate(t *testing.T) {
 	for _, h := range deflateTests {
 		var buf bytes.Buffer
@@ -91,7 +117,7 @@ func TestDeflate(t *testing.T) {
 		w.Write(h.in)
 		w.Close()
 		if !bytes.Equal(buf.Bytes(), h.out) {
-			t.Errorf("Deflate(%d, %x) = %x, want %x", h.level, h.in, buf.Bytes(), h.out)
+			t.Errorf("Deflate(%d, %x) = \n%#v, want \n%#v", h.level, h.in, buf.Bytes(), h.out)
 		}
 	}
 }
@@ -289,6 +315,9 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str
 		t.Errorf("level: %d, len(compress(data)) = %d > limit = %d", level, buffer.Len(), limit)
 		return
 	}
+	if limit > 0 {
+		t.Logf("level: %d, size:%.2f%%, %d b\n", level, float64(buffer.Len()*100)/float64(limit), buffer.Len())
+	}
 	r := NewReader(&buffer)
 	out, err := ioutil.ReadAll(r)
 	if err != nil {
@@ -457,6 +486,17 @@ func TestWriterReset(t *testing.T) {
 		// DeepEqual doesn't compare functions.
 		w.d.fill, wref.d.fill = nil, nil
 		w.d.step, wref.d.step = nil, nil
+		w.d.bulkHasher, wref.d.bulkHasher = nil, nil
+		// hashMatch is always overwritten when used.
+		copy(w.d.hashMatch[:], wref.d.hashMatch[:])
+		if len(w.d.tokens) != 0 {
+			t.Errorf("level %d Writer not reset after Reset. %d tokens were present", level, len(w.d.tokens))
+		}
+		// As long as the length is 0, we don't care about the content.
+		w.d.tokens = wref.d.tokens
+
+		// We don't care if there are values in the window, as long as d.index is 0
+		w.d.window = wref.d.window
 		if !reflect.DeepEqual(w, wref) {
 			t.Errorf("level %d Writer not reset after Reset", level)
 		}
@@ -481,7 +521,7 @@ func testResetOutput(t *testing.T, newWriter func(w io.Writer) (*Writer, error))
 		w.Write(b)
 	}
 	w.Close()
-	out1 := buf.String()
+	out1 := buf.Bytes()
 
 	buf2 := new(bytes.Buffer)
 	w.Reset(buf2)
@@ -489,10 +529,23 @@ func testResetOutput(t *testing.T, newWriter func(w io.Writer) (*Writer, error))
 		w.Write(b)
 	}
 	w.Close()
-	out2 := buf2.String()
+	out2 := buf2.Bytes()
 
-	if out1 != out2 {
-		t.Errorf("got %q, expected %q", out2, out1)
+	if len(out1) != len(out2) {
+		t.Errorf("got %d, expected %d bytes", len(out2), len(out1))
+		return
+	}
+	if !bytes.Equal(out1, out2) {
+		mm := 0
+		for i, b := range out1[:len(out2)] {
+			if b != out2[i] {
+				t.Errorf("mismatch index %d: %#02x, expected %#02x", i, out2[i], b)
+			}
+			mm++
+			if mm == 10 {
+				t.Fatal("Stopping")
+			}
+		}
 	}
 	t.Logf("got %d bytes", len(out1))
 }
-- 
2.48.1