From 53efe1e121e68feadcce00d1f7b1978dbbd1d1c7 Mon Sep 17 00:00:00 2001
From: Klaus Post
Date: Mon, 21 Mar 2016 10:21:55 +0100
Subject: [PATCH] compress/flate: rework matching algorithm
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This changes how matching is done in the deflate algorithm.

The major change is that we no longer look for matches that are only
3 bytes in length; matches must be at least 4 bytes. Contrary to what
you would expect, this actually improves the compression ratio, since
3 literal bytes will often be shorter than a match after Huffman
encoding. This varies a bit by source, but it is most often the case
when the source is "easy" to compress.

Second, a "stronger" hash is used. The hash is similar to the hashing
function used by Snappy.

Overall, the speed impact is biggest on higher compression levels.
I intend to replace the "speed" compression level; the replacement
can be seen in CL 21021.

The built-in benchmark using "digits" is slower at level 1. I see this
as an exception, since "digits" is a special type of data with low
entropy (numbers 0->9) but no significant matches. Again, CL 21021
fixes that case.

NewWriterDict is also made considerably faster by not running the data
through the entire encoder. This is not reflected by the benchmark.

COMPARED to tip/master:

name                       old time/op    new time/op    delta
EncodeDigitsSpeed1e4-4        401µs ± 1%     345µs ± 2%  -13.95%
EncodeDigitsSpeed1e5-4       3.19ms ± 1%    4.27ms ± 3%  +33.96%
EncodeDigitsSpeed1e6-4       27.7ms ± 4%    43.8ms ± 3%  +58.00%
EncodeDigitsDefault1e4-4      641µs ± 0%     403µs ± 1%  -37.15%
EncodeDigitsDefault1e5-4     13.8ms ± 1%     6.4ms ± 3%  -53.73%
EncodeDigitsDefault1e6-4      162ms ± 1%      64ms ± 2%  -60.51%
EncodeDigitsCompress1e4-4     627µs ± 1%     405µs ± 2%  -35.45%
EncodeDigitsCompress1e5-4    13.9ms ± 0%     6.3ms ± 2%  -54.46%
EncodeDigitsCompress1e6-4     159ms ± 1%      64ms ± 0%  -59.91%
EncodeTwainSpeed1e4-4         433µs ± 4%     331µs ± 1%  -23.53%
EncodeTwainSpeed1e5-4        2.82ms ± 1%    3.08ms ± 0%   +9.10%
EncodeTwainSpeed1e6-4        28.1ms ± 2%    28.8ms ± 0%   +2.82%
EncodeTwainDefault1e4-4       695µs ± 4%     474µs ± 1%  -31.78%
EncodeTwainDefault1e5-4      11.8ms ± 0%     7.4ms ± 0%  -37.31%
EncodeTwainDefault1e6-4       128ms ± 0%      75ms ± 0%  -40.93%
EncodeTwainCompress1e4-4      719µs ± 3%     480µs ± 0%  -33.27%
EncodeTwainCompress1e5-4     15.0ms ± 3%     8.2ms ± 2%  -45.55%
EncodeTwainCompress1e6-4      170ms ± 0%      85ms ± 1%  -49.99%

name                       old speed      new speed       delta
EncodeDigitsSpeed1e4-4     25.0MB/s ± 1%  29.0MB/s ± 2%   +16.24%
EncodeDigitsSpeed1e5-4     31.4MB/s ± 1%  23.4MB/s ± 3%   -25.34%
EncodeDigitsSpeed1e6-4     36.1MB/s ± 4%  22.8MB/s ± 3%   -36.74%
EncodeDigitsDefault1e4-4   15.6MB/s ± 0%  24.8MB/s ± 1%   +59.11%
EncodeDigitsDefault1e5-4   7.27MB/s ± 1%  15.72MB/s ± 3%  +116.23%
EncodeDigitsDefault1e6-4   6.16MB/s ± 0%  15.60MB/s ± 2%  +153.25%
EncodeDigitsCompress1e4-4  15.9MB/s ± 1%  24.7MB/s ± 2%   +54.97%
EncodeDigitsCompress1e5-4  7.19MB/s ± 0%  15.78MB/s ± 2%  +119.62%
EncodeDigitsCompress1e6-4  6.27MB/s ± 1%  15.65MB/s ± 0%  +149.52%
EncodeTwainSpeed1e4-4      23.1MB/s ± 4%  30.2MB/s ± 1%   +30.68%
EncodeTwainSpeed1e5-4      35.4MB/s ± 1%  32.5MB/s ± 0%    -8.34%
EncodeTwainSpeed1e6-4      35.6MB/s ± 2%  34.7MB/s ± 0%    -2.77%
EncodeTwainDefault1e4-4    14.4MB/s ± 4%  21.1MB/s ± 1%   +46.48%
EncodeTwainDefault1e5-4    8.49MB/s ± 0%  13.55MB/s ± 0%   +59.50%
EncodeTwainDefault1e6-4    7.83MB/s ± 0%  13.25MB/s ± 0%   +69.19%
EncodeTwainCompress1e4-4   13.9MB/s ± 3%  20.8MB/s ± 0%   +49.83%
EncodeTwainCompress1e5-4   6.65MB/s ± 3%  12.20MB/s ± 2%   +83.51%
EncodeTwainCompress1e6-4   5.88MB/s ± 0%  11.76MB/s ± 1%  +100.06%
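For illustration only (not part of the patch): a minimal, self-contained
sketch of the Snappy-style hash mentioned above. The constants mirror the
hashBits value and the multiplier that the diff below introduces as hash4;
the surrounding main is assumed scaffolding.

    package main

    import "fmt"

    const (
        hashBits = 17         // matches hashBits in the diff below
        hashmul  = 0x1e35a7bd // the Snappy-style odd multiplier from the diff
    )

    // hash4 mirrors the helper this patch adds: load 4 bytes big-endian,
    // multiply by an odd constant, and keep only the top hashBits bits.
    // The multiplication mixes all four bytes into the high bits, unlike
    // the old shift-and-add hash over 3 bytes.
    func hash4(b []byte) uint32 {
        return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits)
    }

    func main() {
        fmt.Printf("%#x\n", hash4([]byte("abcd")))
        fmt.Printf("%#x\n", hash4([]byte("abce"))) // one differing byte -> very different bucket
    }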
Change-Id: I724e33c1dd3e3a6a1b0a68e094baa959352baf32
Reviewed-on: https://go-review.googlesource.com/20929
Run-TryBot: Nigel Tao
Reviewed-by: Nigel Tao
---
 src/compress/flate/deflate.go      | 237 ++++++++++++++++++-----------
 src/compress/flate/deflate_test.go |  63 +++++++-
 2 files changed, 210 insertions(+), 90 deletions(-)

diff --git a/src/compress/flate/deflate.go b/src/compress/flate/deflate.go
index 199fc4cf3c..428f2508d3 100644
--- a/src/compress/flate/deflate.go
+++ b/src/compress/flate/deflate.go
@@ -13,14 +13,13 @@ import (
 
 const (
 	NoCompression      = 0
 	BestSpeed          = 1
-	fastCompression    = 3
 	BestCompression    = 9
 	DefaultCompression = -1
 	logWindowSize      = 15
 	windowSize         = 1 << logWindowSize
 	windowMask         = windowSize - 1
 	logMaxOffsetSize   = 15  // Standard DEFLATE
-	minMatchLength     = 3   // The smallest match that the compressor looks for
+	minMatchLength     = 4   // The smallest match that the compressor looks for
 	maxMatchLength     = 258 // The longest match for the compressor
 	minOffsetSize      = 1   // The shortest offset that makes any sense
@@ -28,39 +27,39 @@ const (
 	// stop things from getting too large.
 	maxFlateBlockTokens = 1 << 14
 	maxStoreBlockSize   = 65535
-	hashBits            = 17
+	hashBits            = 17 // After 17 performance degrades
 	hashSize            = 1 << hashBits
 	hashMask            = (1 << hashBits) - 1
-	hashShift           = (hashBits + minMatchLength - 1) / minMatchLength
 	maxHashOffset       = 1 << 24
 
 	skipNever = math.MaxInt32
 )
 
 type compressionLevel struct {
-	good, lazy, nice, chain, fastSkipHashing int
+	level, good, lazy, nice, chain, fastSkipHashing int
 }
 
 var levels = []compressionLevel{
 	{}, // 0
 	// For levels 1-3 we don't bother trying with lazy matches
-	{3, 0, 8, 4, 4},
-	{3, 0, 16, 8, 5},
-	{3, 0, 32, 32, 6},
+	{1, 4, 0, 8, 4, 4},
+	{2, 4, 0, 16, 8, 5},
+	{3, 4, 0, 32, 32, 6},
 	// Levels 4-9 use increasingly more lazy matching
 	// and increasingly stringent conditions for "good enough".
-	{4, 4, 16, 16, skipNever},
-	{8, 16, 32, 32, skipNever},
-	{8, 16, 128, 128, skipNever},
-	{8, 32, 128, 256, skipNever},
-	{32, 128, 258, 1024, skipNever},
-	{32, 258, 258, 4096, skipNever},
+	{4, 4, 4, 16, 16, skipNever},
+	{5, 8, 16, 32, 32, skipNever},
+	{6, 8, 16, 128, 128, skipNever},
+	{7, 8, 32, 128, 256, skipNever},
+	{8, 32, 128, 258, 1024, skipNever},
+	{9, 32, 258, 258, 4096, skipNever},
 }
 
 type compressor struct {
 	compressionLevel
 
-	w *huffmanBitWriter
+	w          *huffmanBitWriter
+	bulkHasher func([]byte, []uint32)
 
 	// compression algorithm
 	fill func(*compressor, []byte) int // copy data to window
@@ -73,8 +72,8 @@ type compressor struct {
 	// hashPrev[hashHead[hashValue] & windowMask] contains the previous index
 	// with the same hash value.
 	chainHead  int
-	hashHead   []int
-	hashPrev   []int
+	hashHead   []uint32
+	hashPrev   []uint32
 	hashOffset int
 
 	// input window: unprocessed data is window[index:windowEnd]
@@ -90,9 +89,12 @@ type compressor struct {
 	// deflate state
 	length         int
 	offset         int
-	hash           int
+	hash           uint32
 	maxInsertIndex int
 	err            error
+
+	// hashMatch must be able to contain hashes for the maximum match length.
+	hashMatch [maxMatchLength - 1]uint32
 }
 
 func (d *compressor) fillDeflate(b []byte) int {
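As orientation for the hunks that follow: hashHead and hashPrev together
form per-hash chains of candidate positions, newest first. A simplified,
self-contained sketch of the idea (assumptions: window constants are
hard-coded, and the hashOffset rebasing that the real code layers on top
is omitted; position 0 doubles as the empty sentinel, which is why the
real tables store positions offset by hashOffset >= 1):

    package main

    import "fmt"

    const (
        hashBits   = 17
        windowSize = 1 << 15
        windowMask = windowSize - 1
    )

    // chain walks the candidate positions recorded for hash h, newest
    // first. hashHead[h] is the most recent position whose 4-byte window
    // hashed to h; hashPrev[p&windowMask] is the position before p with
    // the same hash. Position 0 acts as the "no entry" sentinel here.
    func chain(hashHead, hashPrev []uint32, h uint32, maxChain int) []int {
        var out []int
        for i := int(hashHead[h]); i > 0 && maxChain > 0; maxChain-- {
            out = append(out, i)
            i = int(hashPrev[i&windowMask])
        }
        return out
    }

    func main() {
        hashHead := make([]uint32, 1<<hashBits)
        hashPrev := make([]uint32, windowSize)

        // Record positions 5, 42 and 100 as hashing to the same bucket 7.
        for _, p := range []int{5, 42, 100} {
            hashPrev[p&windowMask] = hashHead[7]
            hashHead[7] = uint32(p)
        }
        fmt.Println(chain(hashHead, hashPrev, 7, 8)) // [100 42 5]
    }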
@@ -112,15 +114,15 @@ func (d *compressor) fillDeflate(b []byte) int {
 		d.hashOffset -= delta
 		d.chainHead -= delta
 		for i, v := range d.hashPrev {
-			if v > delta {
-				d.hashPrev[i] -= delta
+			if int(v) > delta {
+				d.hashPrev[i] = uint32(int(v) - delta)
 			} else {
 				d.hashPrev[i] = 0
 			}
 		}
 		for i, v := range d.hashHead {
-			if v > delta {
-				d.hashHead[i] -= delta
+			if int(v) > delta {
+				d.hashHead[i] = uint32(int(v) - delta)
 			} else {
 				d.hashHead[i] = 0
 			}
 		}
@@ -132,19 +134,73 @@ func (d *compressor) fillDeflate(b []byte) int {
 	return n
 }
 
-func (d *compressor) writeBlock(tokens []token, index int, eof bool) error {
-	if index > 0 || eof {
+func (d *compressor) writeBlock(tokens []token, index int) error {
+	if index > 0 {
 		var window []byte
 		if d.blockStart <= index {
 			window = d.window[d.blockStart:index]
 		}
 		d.blockStart = index
-		d.w.writeBlock(tokens, eof, window)
+		d.w.writeBlock(tokens, false, window)
 		return d.w.err
 	}
 	return nil
 }
 
+// fillWindow will fill the current window with the supplied
+// dictionary and calculate all hashes.
+// This is much faster than doing a full encode.
+// Should only be used after a reset.
+func (d *compressor) fillWindow(b []byte) {
+	// Do not fill window if we are in store-only mode.
+	if d.compressionLevel.level == 0 {
+		return
+	}
+	if d.index != 0 || d.windowEnd != 0 {
+		panic("internal error: fillWindow called with stale data")
+	}
+
+	// If we are given too much, cut it.
+	if len(b) > windowSize {
+		b = b[len(b)-windowSize:]
+	}
+	// Add all to window.
+	n := copy(d.window, b)
+
+	// Calculate 256 hashes at the time (more L1 cache hits)
+	loops := (n + 256 - minMatchLength) / 256
+	for j := 0; j < loops; j++ {
+		index := j * 256
+		end := index + 256 + minMatchLength - 1
+		if end > n {
+			end = n
+		}
+		toCheck := d.window[index:end]
+		dstSize := len(toCheck) - minMatchLength + 1
+
+		if dstSize <= 0 {
+			continue
+		}
+
+		dst := d.hashMatch[:dstSize]
+		d.bulkHasher(toCheck, dst)
+		var newH uint32
+		for i, val := range dst {
+			di := i + index
+			newH = val & hashMask
+			// Get previous value with the same hash.
+			// Our chain should point to the previous value.
+			d.hashPrev[di&windowMask] = d.hashHead[newH]
+			// Set the head of the hash chain to us.
+			d.hashHead[newH] = uint32(di + d.hashOffset)
+		}
+		d.hash = newH
+	}
+	// Update window information.
+	d.windowEnd = n
+	d.index = n
+}
+
 // Try to find a match starting at index whose length is greater than prevSize.
 // We only look at chainCount possibilities before giving up.
 func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
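The slab arithmetic in fillWindow above is easier to see with concrete
numbers. A standalone sketch (the 600-byte input size is an arbitrary
assumption): slabs overlap by minMatchLength-1 bytes, so every 4-byte
window is hashed exactly once while the working set stays small enough
for L1 cache.

    package main

    import "fmt"

    const minMatchLength = 4 // matches the new constant in deflate.go

    func main() {
        n := 600 // assume 600 bytes were copied into the window
        loops := (n + 256 - minMatchLength) / 256
        total := 0
        for j := 0; j < loops; j++ {
            index := j * 256
            end := index + 256 + minMatchLength - 1
            if end > n {
                end = n
            }
            hashes := end - index - minMatchLength + 1
            total += hashes
            fmt.Printf("slab %d: window[%d:%d], %d hashes\n", j, index, end, hashes)
        }
        fmt.Println(total == n-minMatchLength+1) // true: every position hashed once
    }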
@@ -168,20 +224,15 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead
 		tries >>= 2
 	}
 
-	w0 := win[pos]
-	w1 := win[pos+1]
 	wEnd := win[pos+length]
+	wPos := win[pos:]
 	minIndex := pos - windowSize
 
 	for i := prevHead; tries > 0; tries-- {
-		if w0 == win[i] && w1 == win[i+1] && wEnd == win[i+length] {
-			// The hash function ensures that if win[i] and win[i+1] match, win[i+2] matches
+		if wEnd == win[i+length] {
+			n := matchLen(win[i:], wPos, minMatchLook)
 
-			n := 3
-			for pos+n < len(win) && win[i+n] == win[pos+n] {
-				n++
-			}
-			if n > length && (n > 3 || pos-i <= 4096) {
+			if n > length && (n > minMatchLength || pos-i <= 4096) {
 				length = n
 				offset = pos - i
 				ok = true
@@ -196,7 +247,8 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead
 			// hashPrev[i & windowMask] has already been overwritten, so stop now.
 			break
 		}
-		if i = d.hashPrev[i&windowMask] - d.hashOffset; i < minIndex || i < 0 {
+		i = int(d.hashPrev[i&windowMask]) - d.hashOffset
+		if i < minIndex || i < 0 {
 			break
 		}
 	}
@@ -211,9 +263,46 @@ func (d *compressor) writeStoredBlock(buf []byte) error {
 	return d.w.err
 }
 
+const hashmul = 0x1e35a7bd
+
+// hash4 returns a hash representation of the first 4 bytes
+// of the supplied slice.
+// The caller must ensure that len(b) >= 4.
+func hash4(b []byte) uint32 {
+	return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits)
+}
+
+// bulkHash4 will compute hashes using the same
+// algorithm as hash4
+func bulkHash4(b []byte, dst []uint32) {
+	if len(b) < minMatchLength {
+		return
+	}
+	hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
+	dst[0] = (hb * hashmul) >> (32 - hashBits)
+	end := len(b) - minMatchLength + 1
+	for i := 1; i < end; i++ {
+		hb = (hb << 8) | uint32(b[i+3])
+		dst[i] = (hb * hashmul) >> (32 - hashBits)
+	}
+}
+
+// matchLen returns the number of matching bytes in a and b
+// up to length 'max'. Both slices must be at least 'max'
+// bytes in size.
+func matchLen(a, b []byte, max int) int {
+	a = a[:max]
+	for i, av := range a {
+		if b[i] != av {
+			return i
+		}
+	}
+	return max
+}
+
 func (d *compressor) initDeflate() {
-	d.hashHead = make([]int, hashSize)
-	d.hashPrev = make([]int, windowSize)
+	d.hashHead = make([]uint32, hashSize)
+	d.hashPrev = make([]uint32, windowSize)
 	d.window = make([]byte, 2*windowSize)
 	d.hashOffset = 1
 	d.tokens = make([]token, 0, maxFlateBlockTokens+1)
@@ -223,6 +312,7 @@ func (d *compressor) initDeflate() {
 	d.index = 0
 	d.hash = 0
 	d.chainHead = -1
+	d.bulkHasher = bulkHash4
 }
 
 func (d *compressor) deflate() {
@@ -232,7 +322,7 @@ func (d *compressor) deflate() {
 
 	d.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
 	if d.index < d.maxInsertIndex {
-		d.hash = int(d.window[d.index])<<hashShift + int(d.window[d.index+1])
+		d.hash = hash4(d.window[d.index : d.index+minMatchLength])
 	}
 
 Loop:
@@ -254,7 +344,7 @@ Loop:
 					d.byteAvailable = false
 				}
 				if len(d.tokens) > 0 {
-					if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+					if d.err = d.writeBlock(d.tokens, d.index); d.err != nil {
 						return
 					}
 					d.tokens = d.tokens[:0]
@@ -266,10 +356,10 @@ Loop:
 		}
 		if d.index < d.maxInsertIndex {
 			// Update the hash
-			d.hash = (d.hash<<hashShift + int(d.window[d.index+2])) & hashMask
-			d.chainHead = d.hashHead[d.hash]
-			d.hashPrev[d.index&windowMask] = d.chainHead
-			d.hashHead[d.hash] = d.index + d.hashOffset
+			d.hash = hash4(d.window[d.index : d.index+minMatchLength])
+			d.chainHead = int(d.hashHead[d.hash])
+			d.hashPrev[d.index&windowMask] = uint32(d.chainHead)
+			d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
 		}
 		prevLength := d.length
 		prevOffset := d.offset
@@ -288,11 +378,11 @@ Loop:
 				}
 				for d.index++; d.index < newIndex; d.index++ {
 					if d.index < d.maxInsertIndex {
-						d.hash = (d.hash<<hashShift + int(d.window[d.index+2])) & hashMask
+						d.hash = hash4(d.window[d.index : d.index+minMatchLength])
 						// Get previous value with the same hash.
 						// Our chain should point to the previous value.
 						d.hashPrev[d.index&windowMask] = d.hashHead[d.hash]
 						// Set the head of the hash chain to us.
-						d.hashHead[d.hash] = d.index + d.hashOffset
+						d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
 					}
 				}
@@ -304,7 +394,7 @@ Loop:
 			} else {
 				// For matches this long, we don't bother inserting
 				// each individual item into the table.
 				d.index += d.length
 				if d.index < d.maxInsertIndex {
-					d.hash = int(d.window[d.index])<<hashShift + int(d.window[d.index+1])
+					d.hash = hash4(d.window[d.index : d.index+minMatchLength])
 				}
@@ -315,7 +405,7 @@ Loop:
 			}
 			if len(d.tokens) == maxFlateBlockTokens {
 				// The block includes the current character
-				if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+				if d.err = d.writeBlock(d.tokens, d.index); d.err != nil {
 					return
 				}
 				d.tokens = d.tokens[:0]
@@ -330,7 +420,7 @@ Loop:
 				d.tokens = append(d.tokens, literalToken(uint32(d.window[i])))
 				if len(d.tokens) == maxFlateBlockTokens {
-					if d.err = d.writeBlock(d.tokens, i+1, false); d.err != nil {
+					if d.err = d.writeBlock(d.tokens, i+1); d.err != nil {
 						return
 					}
 					d.tokens = d.tokens[:0]
 				}
@@ -397,7 +487,6 @@ func (d *compressor) write(b []byte) (n int, err error) {
 		return 0, d.err
 	}
 	n = len(b)
-	b = b[d.fill(d, b):]
 	for len(b) > 0 {
 		d.step(d)
 		b = b[d.fill(d, b):]
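The rewritten findMatch rejects most candidates with a single byte
comparison before running matchLen: a candidate can only beat the best
match so far if it also agrees at offset `length`. A standalone sketch
(matchLen is copied from the patch; the window contents, positions and
the pre-existing best length of 4 are made-up values):

    package main

    import "fmt"

    // matchLen is copied from the patch: the number of matching bytes in
    // a and b, up to max. Both slices must be at least max bytes long.
    func matchLen(a, b []byte, max int) int {
        a = a[:max]
        for i, av := range a {
            if b[i] != av {
                return i
            }
        }
        return max
    }

    func main() {
        win := []byte("abcde_abcxy_abcdefg")
        pos, best := 12, 4 // matching starts at "abcdefg"; best match so far is 4 bytes

        for _, i := range []int{0, 6} { // candidate positions from the hash chain
            // One-byte filter: only scan if the candidate matches at
            // offset `best`, i.e. it could improve on the current best.
            if win[i+best] != win[pos+best] {
                fmt.Printf("candidate %d: rejected without scanning\n", i)
                continue
            }
            n := matchLen(win[i:], win[pos:], len(win)-pos)
            fmt.Printf("candidate %d: match of length %d\n", i, n)
        }
    }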
@@ -425,42 +514,29 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 	return nil
 }
 
-var zeroes [32]int
-var bzeroes [256]byte
+// hzeroes is used for zeroing the hash slice.
+var hzeroes [256]uint32
 
 func (d *compressor) reset(w io.Writer) {
 	d.w.reset(w)
 	d.sync = false
 	d.err = nil
-	switch d.compressionLevel.chain {
-	case 0:
-		// level was NoCompression.
-		for i := range d.window {
-			d.window[i] = 0
-		}
+	switch d.compressionLevel.level {
+	case NoCompression:
 		d.windowEnd = 0
 	default:
 		d.chainHead = -1
 		for s := d.hashHead; len(s) > 0; {
-			n := copy(s, zeroes[:])
+			n := copy(s, hzeroes[:])
 			s = s[n:]
 		}
-		for s := d.hashPrev; len(s) > 0; s = s[len(zeroes):] {
-			copy(s, zeroes[:])
+		for s := d.hashPrev; len(s) > 0; s = s[len(hzeroes):] {
+			copy(s, hzeroes[:])
 		}
 		d.hashOffset = 1
-
 		d.index, d.windowEnd = 0, 0
-		for s := d.window; len(s) > 0; {
-			n := copy(s, bzeroes[:])
-			s = s[n:]
-		}
 		d.blockStart, d.byteAvailable = 0, false
-
-		d.tokens = d.tokens[:maxFlateBlockTokens+1]
-		for i := 0; i <= maxFlateBlockTokens; i++ {
-			d.tokens[i] = 0
-		}
 		d.tokens = d.tokens[:0]
 		d.length = minMatchLength - 1
 		d.offset = 0
@@ -509,28 +585,22 @@ func NewWriter(w io.Writer, level int) (*Writer, error) {
 // can only be decompressed by a Reader initialized with the
 // same dictionary.
 func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) {
-	dw := &dictWriter{w, false}
+	dw := &dictWriter{w}
 	zw, err := NewWriter(dw, level)
 	if err != nil {
 		return nil, err
 	}
-	zw.Write(dict)
-	zw.Flush()
-	dw.enabled = true
+	zw.d.fillWindow(dict)
 	zw.dict = append(zw.dict, dict...) // duplicate dictionary for Reset method.
 	return zw, err
 }
 
 type dictWriter struct {
-	w       io.Writer
-	enabled bool
+	w io.Writer
 }
 
 func (w *dictWriter) Write(b []byte) (n int, err error) {
-	if w.enabled {
-		return w.w.Write(b)
-	}
-	return len(b), nil
+	return w.w.Write(b)
 }
 
 // A Writer takes data written to it and writes the compressed
@@ -572,10 +642,7 @@ func (w *Writer) Reset(dst io.Writer) {
 		// w was created with NewWriterDict
 		dw.w = dst
 		w.d.reset(dw)
-		dw.enabled = false
-		w.Write(w.dict)
-		w.Flush()
-		dw.enabled = true
+		w.d.fillWindow(w.dict)
 	} else {
 		// w was created with NewWriter
 		w.d.reset(dst)
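For reference, the NewWriterDict fast path above changes no public
behavior: the dictionary is now preloaded via fillWindow instead of
being compressed and discarded, and a writer primed with a dictionary
still pairs with NewReaderDict in the usual way. A small usage sketch
(the sample dictionary and payload are made up):

    package main

    import (
        "bytes"
        "compress/flate"
        "fmt"
        "io/ioutil"
        "log"
    )

    func main() {
        dict := []byte("hello world")
        var buf bytes.Buffer

        zw, err := flate.NewWriterDict(&buf, flate.BestCompression, dict)
        if err != nil {
            log.Fatal(err)
        }
        zw.Write([]byte("hello world, hello world"))
        zw.Close()

        // The reader must be primed with the same dictionary.
        zr := flate.NewReaderDict(&buf, dict)
        out, err := ioutil.ReadAll(zr)
        if err != nil {
            log.Fatal(err)
        }
        zr.Close()
        fmt.Printf("%s\n", out) // hello world, hello world
    }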
diff --git a/src/compress/flate/deflate_test.go b/src/compress/flate/deflate_test.go
index c165ade734..6201213f01 100644
--- a/src/compress/flate/deflate_test.go
+++ b/src/compress/flate/deflate_test.go
@@ -80,6 +80,32 @@ func largeDataChunk() []byte {
 	return result
 }
 
+func TestBulkHash4(t *testing.T) {
+	for _, x := range deflateTests {
+		y := x.out
+		if len(y) < minMatchLength {
+			continue
+		}
+		y = append(y, y...)
+		for j := 4; j < len(y); j++ {
+			y := y[:j]
+			dst := make([]uint32, len(y)-minMatchLength+1)
+			for i := range dst {
+				dst[i] = uint32(i + 100)
+			}
+			bulkHash4(y, dst)
+			for i, got := range dst {
+				want := hash4(y[i:])
+				if got != want && got == uint32(i)+100 {
+					t.Errorf("Len:%d Index:%d, want 0x%08x but not modified", len(y), i, want)
+				} else if got != want {
+					t.Errorf("Len:%d Index:%d, got 0x%08x want:0x%08x", len(y), i, got, want)
+				}
+			}
+		}
+	}
+}
+
 func TestDeflate(t *testing.T) {
 	for _, h := range deflateTests {
 		var buf bytes.Buffer
@@ -91,7 +117,7 @@ func TestDeflate(t *testing.T) {
 		w.Write(h.in)
 		w.Close()
 		if !bytes.Equal(buf.Bytes(), h.out) {
-			t.Errorf("Deflate(%d, %x) = %x, want %x", h.level, h.in, buf.Bytes(), h.out)
+			t.Errorf("Deflate(%d, %x) = \n%#v, want \n%#v", h.level, h.in, buf.Bytes(), h.out)
 		}
 	}
 }
@@ -289,6 +315,9 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str
 		t.Errorf("level: %d, len(compress(data)) = %d > limit = %d", level, buffer.Len(), limit)
 		return
 	}
+	if limit > 0 {
+		t.Logf("level: %d, size:%.2f%%, %d b\n", level, float64(buffer.Len()*100)/float64(limit), buffer.Len())
+	}
 	r := NewReader(&buffer)
 	out, err := ioutil.ReadAll(r)
 	if err != nil {
@@ -457,6 +486,17 @@ func TestWriterReset(t *testing.T) {
 		// DeepEqual doesn't compare functions.
 		w.d.fill, wref.d.fill = nil, nil
 		w.d.step, wref.d.step = nil, nil
+		w.d.bulkHasher, wref.d.bulkHasher = nil, nil
+		// hashMatch is always overwritten when used.
+		copy(w.d.hashMatch[:], wref.d.hashMatch[:])
+		if len(w.d.tokens) != 0 {
+			t.Errorf("level %d Writer not reset after Reset. %d tokens were present", level, len(w.d.tokens))
+		}
+		// As long as the length is 0, we don't care about the content.
+		w.d.tokens = wref.d.tokens
+
+		// We don't care if there are values in the window, as long as d.index is 0
+		w.d.window = wref.d.window
 		if !reflect.DeepEqual(w, wref) {
 			t.Errorf("level %d Writer not reset after Reset", level)
 		}
@@ -481,7 +521,7 @@ func testResetOutput(t *testing.T, newWriter func(w io.Writer) (*Writer, error))
 		w.Write(b)
 	}
 	w.Close()
-	out1 := buf.String()
+	out1 := buf.Bytes()
 
 	buf2 := new(bytes.Buffer)
 	w.Reset(buf2)
@@ -489,10 +529,23 @@ func testResetOutput(t *testing.T, newWriter func(w io.Writer) (*Writer, error))
 		w.Write(b)
 	}
 	w.Close()
-	out2 := buf2.String()
+	out2 := buf2.Bytes()
 
-	if out1 != out2 {
-		t.Errorf("got %q, expected %q", out2, out1)
+	if len(out1) != len(out2) {
+		t.Errorf("got %d, expected %d bytes", len(out2), len(out1))
+		return
+	}
+	if !bytes.Equal(out1, out2) {
+		mm := 0
+		for i, b := range out1[:len(out2)] {
+			if b != out2[i] {
+				t.Errorf("mismatch index %d: %#02x, expected %#02x", i, out2[i], b)
+			}
+			mm++
+			if mm == 10 {
+				t.Fatal("Stopping")
+			}
+		}
 	}
 	t.Logf("got %d bytes", len(out1))
 }
-- 
2.48.1