compress/flate: rework matching algorithm

author Klaus Post <klauspost@gmail.com>

Mon, 21 Mar 2016 09:21:55 +0000 (10:21 +0100)

committer Nigel Tao <nigeltao@golang.org>

Wed, 23 Mar 2016 11:33:29 +0000 (11:33 +0000)
author Klaus Post <klauspost@gmail.com>
Mon, 21 Mar 2016 09:21:55 +0000 (10:21 +0100)
committer Nigel Tao <nigeltao@golang.org>
Wed, 23 Mar 2016 11:33:29 +0000 (11:33 +0000)
diff --git a/src/compress/flate/deflate.go b/src/compress/flate/deflate.go

index 199fc4cf3c78f3930c4ecfd42a2b8eedbd80c479..428f2508d3be7cb8b111c8e310fa29cc811a3fec 100644 (file)
--- a/src/compress/flate/deflate.go
+++ b/src/compress/flate/deflate.go
@@ -13,14 +13,13 @@ import (
  const (
         NoCompression      = 0
         BestSpeed          = 1
-       fastCompression    = 3
         BestCompression    = 9
         DefaultCompression = -1
         logWindowSize      = 15
         windowSize         = 1 << logWindowSize
         windowMask         = windowSize - 1
         logMaxOffsetSize   = 15  // Standard DEFLATE
-       minMatchLength     = 3   // The smallest match that the compressor looks for
+       minMatchLength     = 4   // The smallest match that the compressor looks for
         maxMatchLength     = 258 // The longest match for the compressor
         minOffsetSize      = 1   // The shortest offset that makes any sense
  
@@ -28,39 +27,39 @@ const (
         // stop things from getting too large.
         maxFlateBlockTokens = 1 << 14
         maxStoreBlockSize   = 65535
-       hashBits            = 17
+       hashBits            = 17 // After 17 performance degrades
         hashSize            = 1 << hashBits
         hashMask            = (1 << hashBits) - 1
-       hashShift           = (hashBits + minMatchLength - 1) / minMatchLength
         maxHashOffset       = 1 << 24
  
         skipNever = math.MaxInt32
  )
  
  type compressionLevel struct {
-       good, lazy, nice, chain, fastSkipHashing int
+       level, good, lazy, nice, chain, fastSkipHashing int
  }
  
  var levels = []compressionLevel{
         {}, // 0
         // For levels 1-3 we don't bother trying with lazy matches
-       {3, 0, 8, 4, 4},
-       {3, 0, 16, 8, 5},
-       {3, 0, 32, 32, 6},
+       {1, 4, 0, 8, 4, 4},
+       {2, 4, 0, 16, 8, 5},
+       {3, 4, 0, 32, 32, 6},
         // Levels 4-9 use increasingly more lazy matching
         // and increasingly stringent conditions for "good enough".
-       {4, 4, 16, 16, skipNever},
-       {8, 16, 32, 32, skipNever},
-       {8, 16, 128, 128, skipNever},
-       {8, 32, 128, 256, skipNever},
-       {32, 128, 258, 1024, skipNever},
-       {32, 258, 258, 4096, skipNever},
+       {4, 4, 4, 16, 16, skipNever},
+       {5, 8, 16, 32, 32, skipNever},
+       {6, 8, 16, 128, 128, skipNever},
+       {7, 8, 32, 128, 256, skipNever},
+       {8, 32, 128, 258, 1024, skipNever},
+       {9, 32, 258, 258, 4096, skipNever},
  }
  
  type compressor struct {
         compressionLevel
  
-       w *huffmanBitWriter
+       w          *huffmanBitWriter
+       bulkHasher func([]byte, []uint32)
  
         // compression algorithm
         fill func(*compressor, []byte) int // copy data to window
@@ -73,8 +72,8 @@ type compressor struct {
         // hashPrev[hashHead[hashValue] & windowMask] contains the previous index
         // with the same hash value.
         chainHead  int
-       hashHead   []int
-       hashPrev   []int
+       hashHead   []uint32
+       hashPrev   []uint32
         hashOffset int
  
         // input window: unprocessed data is window[index:windowEnd]
@@ -90,9 +89,12 @@ type compressor struct {
         // deflate state
         length         int
         offset         int
-       hash           int
+       hash           uint32
         maxInsertIndex int
         err            error
+
+       // hashMatch must be able to contain hashes for the maximum match length.
+       hashMatch [maxMatchLength - 1]uint32
  }
  
  func (d *compressor) fillDeflate(b []byte) int {
@@ -112,15 +114,15 @@ func (d *compressor) fillDeflate(b []byte) int {
                         d.hashOffset -= delta
                         d.chainHead -= delta
                         for i, v := range d.hashPrev {
-                               if v > delta {
-                                       d.hashPrev[i] -= delta
+                               if int(v) > delta {
+                                       d.hashPrev[i] = uint32(int(v) - delta)
                                 } else {
                                         d.hashPrev[i] = 0
                                 }
                         }
                         for i, v := range d.hashHead {
-                               if v > delta {
-                                       d.hashHead[i] -= delta
+                               if int(v) > delta {
+                                       d.hashHead[i] = uint32(int(v) - delta)
                                 } else {
                                         d.hashHead[i] = 0
                                 }
@@ -132,19 +134,73 @@ func (d *compressor) fillDeflate(b []byte) int {
         return n
  }
  
-func (d *compressor) writeBlock(tokens []token, index int, eof bool) error {
-       if index > 0 || eof {
+func (d *compressor) writeBlock(tokens []token, index int) error {
+       if index > 0 {
                 var window []byte
                 if d.blockStart <= index {
                         window = d.window[d.blockStart:index]
                 }
                 d.blockStart = index
-               d.w.writeBlock(tokens, eof, window)
+               d.w.writeBlock(tokens, false, window)
                 return d.w.err
         }
         return nil
  }
  
+// fillWindow will fill the current window with the supplied
+// dictionary and calculate all hashes.
+// This is much faster than doing a full encode.
+// Should only be used after a reset.
+func (d *compressor) fillWindow(b []byte) {
+       // Do not fill window if we are in store-only mode.
+       if d.compressionLevel.level == 0 {
+               return
+       }
+       if d.index != 0 || d.windowEnd != 0 {
+               panic("internal error: fillWindow called with stale data")
+       }
+
+       // If we are given too much, cut it.
+       if len(b) > windowSize {
+               b = b[len(b)-windowSize:]
+       }
+       // Add all to window.
+       n := copy(d.window, b)
+
+       // Calculate 256 hashes at the time (more L1 cache hits)
+       loops := (n + 256 - minMatchLength) / 256
+       for j := 0; j < loops; j++ {
+               index := j * 256
+               end := index + 256 + minMatchLength - 1
+               if end > n {
+                       end = n
+               }
+               toCheck := d.window[index:end]
+               dstSize := len(toCheck) - minMatchLength + 1
+
+               if dstSize <= 0 {
+                       continue
+               }
+
+               dst := d.hashMatch[:dstSize]
+               d.bulkHasher(toCheck, dst)
+               var newH uint32
+               for i, val := range dst {
+                       di := i + index
+                       newH = val & hashMask
+                       // Get previous value with the same hash.
+                       // Our chain should point to the previous value.
+                       d.hashPrev[di&windowMask] = d.hashHead[newH]
+                       // Set the head of the hash chain to us.
+                       d.hashHead[newH] = uint32(di + d.hashOffset)
+               }
+               d.hash = newH
+       }
+       // Update window information.
+       d.windowEnd = n
+       d.index = n
+}
+
  // Try to find a match starting at index whose length is greater than prevSize.
  // We only look at chainCount possibilities before giving up.
  func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
@@ -168,20 +224,15 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead
                 tries >>= 2
         }
  
-       w0 := win[pos]
-       w1 := win[pos+1]
         wEnd := win[pos+length]
+       wPos := win[pos:]
         minIndex := pos - windowSize
  
         for i := prevHead; tries > 0; tries-- {
-               if w0 == win[i] && w1 == win[i+1] && wEnd == win[i+length] {
-                       // The hash function ensures that if win[i] and win[i+1] match, win[i+2] matches
+               if wEnd == win[i+length] {
+                       n := matchLen(win[i:], wPos, minMatchLook)
  
-                       n := 3
-                       for pos+n < len(win) && win[i+n] == win[pos+n] {
-                               n++
-                       }
-                       if n > length && (n > 3 || pos-i <= 4096) {
+                       if n > length && (n > minMatchLength || pos-i <= 4096) {
                                 length = n
                                 offset = pos - i
                                 ok = true
@@ -196,7 +247,8 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead
                         // hashPrev[i & windowMask] has already been overwritten, so stop now.
                         break
                 }
-               if i = d.hashPrev[i&windowMask] - d.hashOffset; i < minIndex || i < 0 {
+               i = int(d.hashPrev[i&windowMask]) - d.hashOffset
+               if i < minIndex || i < 0 {
                         break
                 }
         }
@@ -211,9 +263,46 @@ func (d *compressor) writeStoredBlock(buf []byte) error {
         return d.w.err
  }
  
+const hashmul = 0x1e35a7bd
+
+// hash4 returns a hash representation of the first 4 bytes
+// of the supplied slice.
+// The caller must ensure that len(b) >= 4.
+func hash4(b []byte) uint32 {
+       return ((uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24) * hashmul) >> (32 - hashBits)
+}
+
+// bulkHash4 will compute hashes using the same
+// algorithm as hash4
+func bulkHash4(b []byte, dst []uint32) {
+       if len(b) < minMatchLength {
+               return
+       }
+       hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
+       dst[0] = (hb * hashmul) >> (32 - hashBits)
+       end := len(b) - minMatchLength + 1
+       for i := 1; i < end; i++ {
+               hb = (hb << 8) | uint32(b[i+3])
+               dst[i] = (hb * hashmul) >> (32 - hashBits)
+       }
+}
+
+// matchLen returns the number of matching bytes in a and b
+// up to length 'max'. Both slices must be at least 'max'
+// bytes in size.
+func matchLen(a, b []byte, max int) int {
+       a = a[:max]
+       for i, av := range a {
+               if b[i] != av {
+                       return i
+               }
+       }
+       return max
+}
+
  func (d *compressor) initDeflate() {
-       d.hashHead = make([]int, hashSize)
-       d.hashPrev = make([]int, windowSize)
+       d.hashHead = make([]uint32, hashSize)
+       d.hashPrev = make([]uint32, windowSize)
         d.window = make([]byte, 2*windowSize)
         d.hashOffset = 1
         d.tokens = make([]token, 0, maxFlateBlockTokens+1)
@@ -223,6 +312,7 @@ func (d *compressor) initDeflate() {
         d.index = 0
         d.hash = 0
         d.chainHead = -1
+       d.bulkHasher = bulkHash4
  }
  
  func (d *compressor) deflate() {
@@ -232,7 +322,7 @@ func (d *compressor) deflate() {
  
         d.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
         if d.index < d.maxInsertIndex {
-               d.hash = int(d.window[d.index])<<hashShift + int(d.window[d.index+1])
+               d.hash = hash4(d.window[d.index : d.index+minMatchLength])
         }
  
  Loop:
@@ -256,7 +346,7 @@ Loop:
                                         d.byteAvailable = false
                                 }
                                 if len(d.tokens) > 0 {
-                                       if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+                                       if d.err = d.writeBlock(d.tokens, d.index); d.err != nil {
                                                 return
                                         }
                                         d.tokens = d.tokens[:0]
@@ -266,10 +356,10 @@ Loop:
                 }
                 if d.index < d.maxInsertIndex {
                         // Update the hash
-                       d.hash = (d.hash<<hashShift + int(d.window[d.index+2])) & hashMask
-                       d.chainHead = d.hashHead[d.hash]
-                       d.hashPrev[d.index&windowMask] = d.chainHead
-                       d.hashHead[d.hash] = d.index + d.hashOffset
+                       d.hash = hash4(d.window[d.index : d.index+minMatchLength])
+                       d.chainHead = int(d.hashHead[d.hash])
+                       d.hashPrev[d.index&windowMask] = uint32(d.chainHead)
+                       d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
                 }
                 prevLength := d.length
                 prevOffset := d.offset
@@ -293,9 +383,9 @@ Loop:
                         // There was a match at the previous step, and the current match is
                         // not better. Output the previous match.
                         if d.fastSkipHashing != skipNever {
-                               d.tokens = append(d.tokens, matchToken(uint32(d.length-minMatchLength), uint32(d.offset-minOffsetSize)))
+                               d.tokens = append(d.tokens, matchToken(uint32(d.length-3), uint32(d.offset-minOffsetSize)))
                         } else {
-                               d.tokens = append(d.tokens, matchToken(uint32(prevLength-minMatchLength), uint32(prevOffset-minOffsetSize)))
+                               d.tokens = append(d.tokens, matchToken(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)))
                         }
                         // Insert in the hash table all strings up to the end of the match.
                         // index and index-1 are already inserted. If there is not enough
@@ -310,12 +400,12 @@ Loop:
                                 }
                                 for d.index++; d.index < newIndex; d.index++ {
                                         if d.index < d.maxInsertIndex {
-                                               d.hash = (d.hash<<hashShift + int(d.window[d.index+2])) & hashMask
+                                               d.hash = hash4(d.window[d.index : d.index+minMatchLength])
                                                 // Get previous value with the same hash.
                                                 // Our chain should point to the previous value.
                                                 d.hashPrev[d.index&windowMask] = d.hashHead[d.hash]
                                                 // Set the head of the hash chain to us.
-                                               d.hashHead[d.hash] = d.index + d.hashOffset
+                                               d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
                                         }
                                 }
                                 if d.fastSkipHashing == skipNever {
@@ -327,12 +417,12 @@ Loop:
                                 // item into the table.
                                 d.index += d.length
                                 if d.index < d.maxInsertIndex {
-                                       d.hash = (int(d.window[d.index])<<hashShift + int(d.window[d.index+1]))
+                                       d.hash = hash4(d.window[d.index : d.index+minMatchLength])
                                 }
                         }
                         if len(d.tokens) == maxFlateBlockTokens {
                                 // The block includes the current character
-                               if d.err = d.writeBlock(d.tokens, d.index, false); d.err != nil {
+                               if d.err = d.writeBlock(d.tokens, d.index); d.err != nil {
                                         return
                                 }
                                 d.tokens = d.tokens[:0]
@@ -345,7 +435,7 @@ Loop:
                                 }
                                 d.tokens = append(d.tokens, literalToken(uint32(d.window[i])))
                                 if len(d.tokens) == maxFlateBlockTokens {
-                                       if d.err = d.writeBlock(d.tokens, i+1, false); d.err != nil {
+                                       if d.err = d.writeBlock(d.tokens, i+1); d.err != nil {
                                                 return
                                         }
                                         d.tokens = d.tokens[:0]
@@ -377,7 +467,6 @@ func (d *compressor) write(b []byte) (n int, err error) {
                 return 0, d.err
         }
         n = len(b)
-       b = b[d.fill(d, b):]
         for len(b) > 0 {
                 d.step(d)
                 b = b[d.fill(d, b):]
@@ -425,42 +514,29 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
         return nil
  }
  
-var zeroes [32]int
-var bzeroes [256]byte
+// hzeroes is used for zeroing the hash slice.
+var hzeroes [256]uint32
  
  func (d *compressor) reset(w io.Writer) {
         d.w.reset(w)
         d.sync = false
         d.err = nil
-       switch d.compressionLevel.chain {
-       case 0:
-               // level was NoCompression.
-               for i := range d.window {
-                       d.window[i] = 0
-               }
+       switch d.compressionLevel.level {
+       case NoCompression:
                 d.windowEnd = 0
         default:
                 d.chainHead = -1
                 for s := d.hashHead; len(s) > 0; {
-                       n := copy(s, zeroes[:])
+                       n := copy(s, hzeroes[:])
                         s = s[n:]
                 }
-               for s := d.hashPrev; len(s) > 0; s = s[len(zeroes):] {
-                       copy(s, zeroes[:])
+               for s := d.hashPrev; len(s) > 0; s = s[len(hzeroes):] {
+                       copy(s, hzeroes[:])
                 }
                 d.hashOffset = 1
  
                 d.index, d.windowEnd = 0, 0
-               for s := d.window; len(s) > 0; {
-                       n := copy(s, bzeroes[:])
-                       s = s[n:]
-               }
                 d.blockStart, d.byteAvailable = 0, false
-
-               d.tokens = d.tokens[:maxFlateBlockTokens+1]
-               for i := 0; i <= maxFlateBlockTokens; i++ {
-                       d.tokens[i] = 0
-               }
                 d.tokens = d.tokens[:0]
                 d.length = minMatchLength - 1
                 d.offset = 0
@@ -509,28 +585,22 @@ func NewWriter(w io.Writer, level int) (*Writer, error) {
  // can only be decompressed by a Reader initialized with the
  // same dictionary.
  func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) {
-       dw := &dictWriter{w, false}
+       dw := &dictWriter{w}
         zw, err := NewWriter(dw, level)
         if err != nil {
                 return nil, err
         }
-       zw.Write(dict)
-       zw.Flush()
-       dw.enabled = true
+       zw.d.fillWindow(dict)
         zw.dict = append(zw.dict, dict...) // duplicate dictionary for Reset method.
         return zw, err
  }
  
  type dictWriter struct {
-       w       io.Writer
-       enabled bool
+       w io.Writer
  }
  
  func (w *dictWriter) Write(b []byte) (n int, err error) {
-       if w.enabled {
-               return w.w.Write(b)
-       }
-       return len(b), nil
+       return w.w.Write(b)
  }
  
  // A Writer takes data written to it and writes the compressed
@@ -572,10 +642,7 @@ func (w *Writer) Reset(dst io.Writer) {
                 // w was created with NewWriterDict
                 dw.w = dst
                 w.d.reset(dw)
-               dw.enabled = false
-               w.Write(w.dict)
-               w.Flush()
-               dw.enabled = true
+               w.d.fillWindow(w.dict)
         } else {
                 // w was created with NewWriter
                 w.d.reset(dst)
diff --git a/src/compress/flate/deflate_test.go b/src/compress/flate/deflate_test.go

index c165ade73495a424f7c6c738dae340b791aaeffb..6201213f013f181ceb189a116ef7ce1ccbbca801 100644 (file)
--- a/src/compress/flate/deflate_test.go
+++ b/src/compress/flate/deflate_test.go
@@ -80,6 +80,32 @@ func largeDataChunk() []byte {
         return result
  }
  
+func TestBulkHash4(t *testing.T) {
+       for _, x := range deflateTests {
+               y := x.out
+               if len(y) < minMatchLength {
+                       continue
+               }
+               y = append(y, y...)
+               for j := 4; j < len(y); j++ {
+                       y := y[:j]
+                       dst := make([]uint32, len(y)-minMatchLength+1)
+                       for i := range dst {
+                               dst[i] = uint32(i + 100)
+                       }
+                       bulkHash4(y, dst)
+                       for i, got := range dst {
+                               want := hash4(y[i:])
+                               if got != want && got == uint32(i)+100 {
+                                       t.Errorf("Len:%d Index:%d, want 0x%08x but not modified", len(y), i, want)
+                               } else if got != want {
+                                       t.Errorf("Len:%d Index:%d, got 0x%08x want:0x%08x", len(y), i, got, want)
+                               }
+                       }
+               }
+       }
+}
+
  func TestDeflate(t *testing.T) {
         for _, h := range deflateTests {
                 var buf bytes.Buffer
@@ -91,7 +117,7 @@ func TestDeflate(t *testing.T) {
                 w.Write(h.in)
                 w.Close()
                 if !bytes.Equal(buf.Bytes(), h.out) {
-                       t.Errorf("Deflate(%d, %x) = %x, want %x", h.level, h.in, buf.Bytes(), h.out)
+                       t.Errorf("Deflate(%d, %x) = \n%#v, want \n%#v", h.level, h.in, buf.Bytes(), h.out)
                 }
         }
  }
@@ -289,6 +315,9 @@ func testToFromWithLevelAndLimit(t *testing.T, level int, input []byte, name str
                 t.Errorf("level: %d, len(compress(data)) = %d > limit = %d", level, buffer.Len(), limit)
                 return
         }
+       if limit > 0 {
+               t.Logf("level: %d, size:%.2f%%, %d b\n", level, float64(buffer.Len()*100)/float64(limit), buffer.Len())
+       }
         r := NewReader(&buffer)
         out, err := ioutil.ReadAll(r)
         if err != nil {
@@ -457,6 +486,17 @@ func TestWriterReset(t *testing.T) {
                 // DeepEqual doesn't compare functions.
                 w.d.fill, wref.d.fill = nil, nil
                 w.d.step, wref.d.step = nil, nil
+               w.d.bulkHasher, wref.d.bulkHasher = nil, nil
+               // hashMatch is always overwritten when used.
+               copy(w.d.hashMatch[:], wref.d.hashMatch[:])
+               if len(w.d.tokens) != 0 {
+                       t.Errorf("level %d Writer not reset after Reset. %d tokens were present", level, len(w.d.tokens))
+               }
+               // As long as the length is 0, we don't care about the content.
+               w.d.tokens = wref.d.tokens
+
+               // We don't care if there are values in the window, as long as it is at d.index is 0
+               w.d.window = wref.d.window
                 if !reflect.DeepEqual(w, wref) {
                         t.Errorf("level %d Writer not reset after Reset", level)
                 }
@@ -481,7 +521,7 @@ func testResetOutput(t *testing.T, newWriter func(w io.Writer) (*Writer, error))
                 w.Write(b)
         }
         w.Close()
-       out1 := buf.String()
+       out1 := buf.Bytes()
  
         buf2 := new(bytes.Buffer)
         w.Reset(buf2)
@@ -489,10 +529,23 @@ func testResetOutput(t *testing.T, newWriter func(w io.Writer) (*Writer, error))
                 w.Write(b)
         }
         w.Close()
-       out2 := buf2.String()
+       out2 := buf2.Bytes()
  
-       if out1 != out2 {
-               t.Errorf("got %q, expected %q", out2, out1)
+       if len(out1) != len(out2) {
+               t.Errorf("got %d, expected %d bytes", len(out2), len(out1))
+               return
+       }
+       if !bytes.Equal(out1, out2) {
+               mm := 0
+               for i, b := range out1[:len(out2)] {
+                       if b != out2[i] {
+                               t.Errorf("mismatch index %d: %#02x, expected %#02x", i, out2[i], b)
+                       }
+                       mm++
+                       if mm == 10 {
+                               t.Fatal("Stopping")
+                       }
+               }
         }
         t.Logf("got %d bytes", len(out1))
  }
author	Klaus Post <klauspost@gmail.com>
	Mon, 21 Mar 2016 09:21:55 +0000 (10:21 +0100)
committer	Nigel Tao <nigeltao@golang.org>
	Wed, 23 Mar 2016 11:33:29 +0000 (11:33 +0000)
src/compress/flate/deflate.go		patch \| blob \| history
src/compress/flate/deflate_test.go		patch \| blob \| history