From 2e196b15b94e48009369aae25299a59cbc7d45f4 Mon Sep 17 00:00:00 2001
From: Klaus Post
Date: Sat, 22 Oct 2016 18:50:02 +0200
Subject: [PATCH] compress/flate: level 1 (best speed) match across blocks

This change makes deflate level 1 (best speed) match across block
boundaries. This comes at a small speed penalty, but improves
compression on almost all output.

Sample numbers on various content types:

enwik9:            391052014 -> 382578469 bytes,  77.59 ->  74.28 MB/s
adresser.001:       57269799 ->  47756095 bytes, 287.84 -> 357.86 MB/s
10gb:             5233055166 -> 5198328382 bytes, 105.85 ->  96.99 MB/s
rawstudio-mint14: 3972329211 -> 3927423364 bytes, 100.07 ->  94.22 MB/s
sites:             165556800 ->  163178702 bytes,  72.31 ->  70.15 MB/s
objectfiles:       115962472 ->  111649524 bytes, 132.60 -> 128.05 MB/s
sharnd.out:        200015283 ->  200015283 bytes, 221.50 -> 218.83 MB/s

Change-Id: I62a139e5c06976e803439a4268acede5139b8cfc
Reviewed-on: https://go-review.googlesource.com/31640
Reviewed-by: Joe Tsai
Reviewed-by: Nigel Tao
---
 src/compress/flate/deflate.go      |  12 +-
 src/compress/flate/deflate_test.go | 117 ++++++++++++++++++
 src/compress/flate/deflatefast.go  | 186 ++++++++++++++++++++++-------
 3 files changed, 267 insertions(+), 48 deletions(-)

diff --git a/src/compress/flate/deflate.go b/src/compress/flate/deflate.go
index ccf6d527d8..7a805235d2 100644
--- a/src/compress/flate/deflate.go
+++ b/src/compress/flate/deflate.go
@@ -84,9 +84,10 @@ type compressor struct {
 	bulkHasher func([]byte, []uint32)
 
 	// compression algorithm
-	fill func(*compressor, []byte) int // copy data to window
-	step func(*compressor)             // process window
-	sync bool                          // requesting flush
+	fill      func(*compressor, []byte) int // copy data to window
+	step      func(*compressor)             // process window
+	sync      bool                          // requesting flush
+	bestSpeed *deflateFast                  // Encoder for BestSpeed
 
 	// Input hash chains
 	// hashHead[hashValue] contains the largest inputIndex with the specified hash value
@@ -346,12 +347,13 @@ func (d *compressor) encSpeed() {
 				d.err = d.w.err
 			}
 			d.windowEnd = 0
+			d.bestSpeed.reset()
 			return
 		}
 	}
 
 	// Encode the block.
-	d.tokens = encodeBestSpeed(d.tokens[:0], d.window[:d.windowEnd])
+	d.tokens = d.bestSpeed.encode(d.tokens[:0], d.window[:d.windowEnd])
 
 	// If we removed less than 1/16th, Huffman compress the block.
 	if len(d.tokens) > d.windowEnd-(d.windowEnd>>4) {
@@ -584,6 +586,7 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 		d.window = make([]byte, maxStoreBlockSize)
 		d.fill = (*compressor).fillStore
 		d.step = (*compressor).encSpeed
+		d.bestSpeed = newDeflateFast()
 		d.tokens = make([]token, maxStoreBlockSize)
 	case level == DefaultCompression:
 		level = 6
@@ -609,6 +612,7 @@ func (d *compressor) reset(w io.Writer) {
 	case BestSpeed:
 		d.windowEnd = 0
 		d.tokens = d.tokens[:0]
+		d.bestSpeed.reset()
 	default:
 		d.chainHead = -1
 		for i := range d.hashHead {
diff --git a/src/compress/flate/deflate_test.go b/src/compress/flate/deflate_test.go
index 3322c40845..415e97262f 100644
--- a/src/compress/flate/deflate_test.go
+++ b/src/compress/flate/deflate_test.go
@@ -490,6 +490,7 @@ func TestWriterReset(t *testing.T) {
 	w.d.fill, wref.d.fill = nil, nil
 	w.d.step, wref.d.step = nil, nil
 	w.d.bulkHasher, wref.d.bulkHasher = nil, nil
+	w.d.bestSpeed, wref.d.bestSpeed = nil, nil
 	// hashMatch is always overwritten when used.
 	copy(w.d.hashMatch[:], wref.d.hashMatch[:])
 	if len(w.d.tokens) != 0 {
@@ -681,3 +682,119 @@ func TestWriterPersistentError(t *testing.T) {
 		}
 	}
 }
+
+func TestBestSpeedMatch(t *testing.T) {
+	cases := []struct {
+		previous, current []byte
+		t, s, want        int32
+	}{{
+		previous: []byte{0, 0, 0, 1, 2},
+		current:  []byte{3, 4, 5, 0, 1, 2, 3, 4, 5},
+		t:        -3,
+		s:        3,
+		want:     6,
+	}, {
+		previous: []byte{0, 0, 0, 1, 2},
+		current:  []byte{2, 4, 5, 0, 1, 2, 3, 4, 5},
+		t:        -3,
+		s:        3,
+		want:     3,
+	}, {
+		previous: []byte{0, 0, 0, 1, 1},
+		current:  []byte{3, 4, 5, 0, 1, 2, 3, 4, 5},
+		t:        -3,
+		s:        3,
+		want:     2,
+	}, {
+		previous: []byte{0, 0, 0, 1, 2},
+		current:  []byte{2, 2, 2, 2, 1, 2, 3, 4, 5},
+		t:        -1,
+		s:        0,
+		want:     4,
+	}, {
+		previous: []byte{0, 0, 0, 1, 2, 3, 4, 5, 2, 2},
+		current:  []byte{2, 2, 2, 2, 1, 2, 3, 4, 5},
+		t:        -7,
+		s:        4,
+		want:     5,
+	}, {
+		previous: []byte{9, 9, 9, 9, 9},
+		current:  []byte{2, 2, 2, 2, 1, 2, 3, 4, 5},
+		t:        -1,
+		s:        0,
+		want:     0,
+	}, {
+		previous: []byte{9, 9, 9, 9, 9},
+		current:  []byte{9, 2, 2, 2, 1, 2, 3, 4, 5},
+		t:        0,
+		s:        1,
+		want:     0,
+	}, {
+		previous: []byte{},
+		current:  []byte{9, 2, 2, 2, 1, 2, 3, 4, 5},
+		t:        -5,
+		s:        1,
+		want:     0,
+	}, {
+		previous: []byte{},
+		current:  []byte{9, 2, 2, 2, 1, 2, 3, 4, 5},
+		t:        -1,
+		s:        1,
+		want:     0,
+	}, {
+		previous: []byte{},
+		current:  []byte{2, 2, 2, 2, 1, 2, 3, 4, 5},
+		t:        0,
+		s:        1,
+		want:     3,
+	}, {
+		previous: []byte{3, 4, 5},
+		current:  []byte{3, 4, 5},
+		t:        -3,
+		s:        0,
+		want:     3,
+	}, {
+		previous: make([]byte, 1000),
+		current:  make([]byte, 1000),
+		t:        -1000,
+		s:        0,
+		want:     maxMatchLength - 4,
+	}, {
+		previous: make([]byte, 200),
+		current:  make([]byte, 500),
+		t:        -200,
+		s:        0,
+		want:     maxMatchLength - 4,
+	}, {
+		previous: make([]byte, 200),
+		current:  make([]byte, 500),
+		t:        0,
+		s:        1,
+		want:     maxMatchLength - 4,
+	}, {
+		previous: make([]byte, maxMatchLength-4),
+		current:  make([]byte, 500),
+		t:        -(maxMatchLength - 4),
+		s:        0,
+		want:     maxMatchLength - 4,
+	}, {
+		previous: make([]byte, 200),
+		current:  make([]byte, 500),
+		t:        -200,
+		s:        400,
+		want:     100,
+	}, {
+		previous: make([]byte, 10),
+		current:  make([]byte, 500),
+		t:        200,
+		s:        400,
+		want:     100,
+	}}
+	for i, c := range cases {
+		e := deflateFast{prev: c.previous}
+		got := e.matchLen(c.s, c.t, c.current)
+		if got != c.want {
+			t.Errorf("Test %d: match length, want %d, got %d", i, c.want, got)
+		}
+	}
+}
diff --git a/src/compress/flate/deflatefast.go b/src/compress/flate/deflatefast.go
index 6b881a477c..5201b2ee1c 100644
--- a/src/compress/flate/deflatefast.go
+++ b/src/compress/flate/deflatefast.go
@@ -14,12 +14,12 @@ const (
 	tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
 )
 
-func load32(b []byte, i int) uint32 {
+func load32(b []byte, i int32) uint32 {
 	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
 	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
 }
 
-func load64(b []byte, i int) uint64 {
+func load64(b []byte, i int32) uint64 {
 	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
 	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
 		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
@@ -38,31 +38,49 @@ const (
 	minNonLiteralBlockSize = 1 + 1 + inputMargin
 )
 
-func encodeBestSpeed(dst []token, src []byte) []token {
+type tableEntry struct {
+	val    uint32 // Value at destination
+	offset int32
+}
+
+// deflateFast maintains the table for matches,
+// and the previous byte block for cross block matching.
+type deflateFast struct {
+	table [tableSize]tableEntry
+	prev  []byte // Previous block, zero length if unknown.
+	cur   int32  // Current match offset.
+}
+
+func newDeflateFast() *deflateFast {
+	return &deflateFast{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}
+}
+
+// encode encodes a block given in src and encodes tokens
+// to dst and returns the result.
+func (e *deflateFast) encode(dst []token, src []byte) []token {
+	// Ensure that e.cur doesn't wrap.
+	if e.cur > 1<<30 {
+		*e = deflateFast{cur: maxStoreBlockSize, prev: e.prev[:0]}
+	}
+
 	// This check isn't in the Snappy implementation, but there, the caller
 	// instead of the callee handles this case.
 	if len(src) < minNonLiteralBlockSize {
+		e.cur += maxStoreBlockSize
+		e.prev = e.prev[:0]
 		return emitLiteral(dst, src)
 	}
 
-	// Initialize the hash table.
-	//
-	// The table element type is uint16, as s < sLimit and sLimit < len(src)
-	// and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535.
-	var table [tableSize]uint16
-
 	// sLimit is when to stop looking for offset/length copies. The inputMargin
 	// lets us use a fast path for emitLiteral in the main loop, while we are
 	// looking for copies.
-	sLimit := len(src) - inputMargin
+	sLimit := int32(len(src) - inputMargin)
 
 	// nextEmit is where in src the next emitLiteral should start from.
-	nextEmit := 0
-
-	// The encoded form must start with a literal, as there are no previous
-	// bytes to copy, so we start looking for hash matches at s == 1.
-	s := 1
-	nextHash := hash(load32(src, s))
+	nextEmit := int32(0)
+	s := int32(0)
+	cv := load32(src, s)
+	nextHash := hash(cv)
 
 	for {
 		// Copied from the C++ snappy implementation:
@@ -80,10 +98,10 @@ func encodeBestSpeed(dst []token, src []byte) []token {
 		// The "skip" variable keeps track of how many bytes there are since
 		// the last match; dividing it by 32 (ie. right-shifting by five) gives
 		// the number of bytes to move ahead for each iteration.
-		skip := 32
+		skip := int32(32)
 
 		nextS := s
-		candidate := 0
+		var candidate tableEntry
 		for {
 			s = nextS
 			bytesBetweenHashLookups := skip >> 5
@@ -92,13 +110,19 @@ func encodeBestSpeed(dst []token, src []byte) []token {
 			if nextS > sLimit {
 				goto emitRemainder
 			}
-			candidate = int(table[nextHash&tableMask])
-			table[nextHash&tableMask] = uint16(s)
-			nextHash = hash(load32(src, nextS))
-			// TODO: < should be <=, and add a test for that.
-			if s-candidate < maxMatchOffset && load32(src, s) == load32(src, candidate) {
-				break
+			candidate = e.table[nextHash&tableMask]
+			now := load32(src, nextS)
+			e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: cv}
+			nextHash = hash(now)
+
+			offset := s - (candidate.offset - e.cur)
+			// TODO: >= should be >, and add a test for that.
+			if offset >= maxMatchOffset || cv != candidate.val {
+				// Out of range or not matched.
+				cv = now
+				continue
 			}
+			break
 		}
 
 		// A 4-byte match has been found. We'll later see if more than 4 bytes
@@ -117,22 +141,16 @@ func encodeBestSpeed(dst []token, src []byte) []token {
 		for {
 			// Invariant: we have a 4-byte match at s, and no need to emit any
 			// literal bytes prior to s.
-			base := s
 
 			// Extend the 4-byte match as long as possible.
 			//
-			// This is an inlined version of Snappy's:
-			//	s = extendMatch(src, candidate+4, s+4)
 			s += 4
-			s1 := base + maxMatchLength
-			if s1 > len(src) {
-				s1 = len(src)
-			}
-			for i := candidate + 4; s < s1 && src[i] == src[s]; i, s = i+1, s+1 {
-			}
+			t := candidate.offset - e.cur + 4
+			l := e.matchLen(s, t, src)
 
-			// matchToken is flate's equivalent of Snappy's emitCopy.
-			dst = append(dst, matchToken(uint32(s-base-baseMatchLength), uint32(base-candidate-baseMatchOffset)))
+			// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
+			dst = append(dst, matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset)))
+			s += l
 			nextEmit = s
 			if s >= sLimit {
 				goto emitRemainder
@@ -145,14 +163,18 @@ func encodeBestSpeed(dst []token, src []byte) []token {
 			// are faster as one load64 call (with some shifts) instead of
 			// three load32 calls.
 			x := load64(src, s-1)
-			prevHash := hash(uint32(x >> 0))
-			table[prevHash&tableMask] = uint16(s - 1)
-			currHash := hash(uint32(x >> 8))
-			candidate = int(table[currHash&tableMask])
-			table[currHash&tableMask] = uint16(s)
+			prevHash := hash(uint32(x))
+			e.table[prevHash&tableMask] = tableEntry{offset: e.cur + s - 1, val: uint32(x)}
+			x >>= 8
+			currHash := hash(uint32(x))
+			candidate = e.table[currHash&tableMask]
+			e.table[currHash&tableMask] = tableEntry{offset: e.cur + s, val: uint32(x)}
+
+			offset := s - (candidate.offset - e.cur)
 			// TODO: >= should be >, and add a test for that.
-			if s-candidate >= maxMatchOffset || uint32(x>>8) != load32(src, candidate) {
-				nextHash = hash(uint32(x >> 16))
+			if offset >= maxMatchOffset || uint32(x) != candidate.val {
+				cv = uint32(x >> 8)
+				nextHash = hash(cv)
 				s++
 				break
 			}
@@ -160,15 +182,91 @@ func encodeBestSpeed(dst []token, src []byte) []token {
 	}
 
 emitRemainder:
-	if nextEmit < len(src) {
+	if int(nextEmit) < len(src) {
 		dst = emitLiteral(dst, src[nextEmit:])
 	}
+	e.cur += int32(len(src))
+	e.prev = e.prev[:len(src)]
+	copy(e.prev, src)
 	return dst
 }
 
 func emitLiteral(dst []token, lit []byte) []token {
 	for _, v := range lit {
-		dst = append(dst, token(v))
+		dst = append(dst, literalToken(uint32(v)))
 	}
 	return dst
 }
+
+// matchLen returns the match length between src[s:] and src[t:].
+// t can be negative to indicate the match is starting in e.prev.
+// We assume that src[s-4:s] and src[t-4:t] already match.
+func (e *deflateFast) matchLen(s, t int32, src []byte) int32 {
+	s1 := int(s) + maxMatchLength - 4
+	if s1 > len(src) {
+		s1 = len(src)
+	}
+
+	// If we are inside the current block
+	if t >= 0 {
+		b := src[t:]
+		a := src[s:s1]
+		b = b[:len(a)]
+		// Extend the match to be as long as possible.
+		for i := range a {
+			if a[i] != b[i] {
+				return int32(i)
+			}
+		}
+		return int32(len(a))
+	}
+
+	// We found a match in the previous block.
+	tp := int32(len(e.prev)) + t
+	if tp < 0 {
+		return 0
+	}
+
+	// Extend the match to be as long as possible.
+	a := src[s:s1]
+	b := e.prev[tp:]
+	if len(b) > len(a) {
+		b = b[:len(a)]
+	}
+	a = a[:len(b)]
+	for i := range b {
+		if a[i] != b[i] {
+			return int32(i)
+		}
+	}
+
+	// If we reached our limit, we matched everything we are
+	// allowed to in the previous block and we return.
+	n := int32(len(b))
+	if int(s+n) == s1 {
+		return n
+	}
+
+	// Continue looking for more matches in the current block.
+	a = src[s+n : s1]
+	b = src[:len(a)]
+	for i := range a {
+		if a[i] != b[i] {
+			return int32(i) + n
+		}
+	}
+	return int32(len(a)) + n
+}
+
+// Reset resets the encoding history.
+// This ensures that no matches are made to the previous block.
+func (e *deflateFast) reset() {
+	e.prev = e.prev[:0]
+	// Bump the offset, so all matches will fail distance check.
+	e.cur += maxMatchOffset
+
+	// Protect against e.cur wraparound.
+	if e.cur > 1<<30 {
+		*e = deflateFast{cur: maxStoreBlockSize, prev: e.prev[:0]}
+	}
+}
-- 
2.48.1
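
Not part of the patch: a small, self-contained sketch of how the cross-block behaviour can be observed from the public compress/flate API. It writes the same pseudorandom 16 KiB chunk twice with a Flush in between, so the two copies land in separate deflate blocks; with this change the second block can emit matches reaching back into the first (the distance stays below maxMatchOffset, 32 KiB), so its compressed size should drop sharply, whereas previously each BestSpeed block started with an empty match table. The chunk size, seed, and printed numbers are illustrative only and will vary with the input and Go version.

package main

import (
	"bytes"
	"compress/flate"
	"fmt"
	"log"
	"math/rand"
)

func main() {
	// 16 KiB of pseudorandom data: nearly incompressible on its own, but
	// trivially matchable when the same bytes appear again within the
	// 32 KiB match-offset window.
	chunk := make([]byte, 16<<10)
	rand.New(rand.NewSource(1)).Read(chunk)

	var buf bytes.Buffer
	w, err := flate.NewWriter(&buf, flate.BestSpeed)
	if err != nil {
		log.Fatal(err)
	}

	// First copy, flushed so it is encoded as its own block.
	if _, err := w.Write(chunk); err != nil {
		log.Fatal(err)
	}
	if err := w.Flush(); err != nil {
		log.Fatal(err)
	}
	firstBlock := buf.Len()

	// Second, identical copy. With cross-block matching it can be encoded
	// almost entirely as matches into the previous block.
	if _, err := w.Write(chunk); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}

	fmt.Printf("first block: %d bytes, rest of stream: %d bytes\n",
		firstBlock, buf.Len()-firstBlock)
}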