compress/flate: distinguish between base and min match length.

author Nigel Tao <nigeltao@golang.org>

Wed, 4 May 2016 20:43:13 +0000 (06:43 +1000)

committer Nigel Tao <nigeltao@golang.org>

Thu, 5 May 2016 00:16:39 +0000 (00:16 +0000)
author Nigel Tao <nigeltao@golang.org>
Wed, 4 May 2016 20:43:13 +0000 (06:43 +1000)
committer Nigel Tao <nigeltao@golang.org>
Thu, 5 May 2016 00:16:39 +0000 (00:16 +0000)
diff --git a/src/compress/flate/deflate.go b/src/compress/flate/deflate.go

index f7ba02fe4ee8bfdbfb7cb0ab789e916e1e53a6a3..8467d7749d09915f01ad15a4f5b509c526417273 100644 (file)
--- a/src/compress/flate/deflate.go
+++ b/src/compress/flate/deflate.go
@@ -19,12 +19,20 @@ const (
         logWindowSize      = 15
         windowSize         = 1 << logWindowSize
         windowMask         = windowSize - 1
-       logMaxOffsetSize   = 15  // Standard DEFLATE
-       minMatchLength     = 4   // The smallest match that the compressor looks for
-       maxMatchLength     = 258 // The longest match for the compressor
-       minOffsetSize      = 1   // The shortest offset that makes any sense
  
-       // The maximum number of tokens we put into a single flat block, just to
+       // The LZ77 step produces a sequence of literal tokens and <length, offset>
+       // pair tokens. The offset is also known as distance. The underlying wire
+       // format limits the range of lengths and offsets. For example, there are
+       // 256 legitimate lengths: those in the range [3, 258]. This package's
+       // compressor uses a higher minimum match length, enabling optimizations
+       // such as finding matches via 32-bit loads and compares.
+       baseMatchLength = 3       // The smallest match length per the RFC section 3.2.5
+       minMatchLength  = 4       // The smallest match length that the compressor actually emits
+       maxMatchLength  = 258     // The largest match length
+       baseMatchOffset = 1       // The smallest match offset
+       maxMatchOffset  = 1 << 15 // The largest match offset
+
+       // The maximum number of tokens we put into a single flate block, just to
         // stop things from getting too large.
         maxFlateBlockTokens = 1 << 14
         maxStoreBlockSize   = 65535
@@ -424,9 +432,9 @@ Loop:
                         // There was a match at the previous step, and the current match is
                         // not better. Output the previous match.
                         if d.fastSkipHashing != skipNever {
-                               d.tokens = append(d.tokens, matchToken(uint32(d.length-3), uint32(d.offset-minOffsetSize)))
+                               d.tokens = append(d.tokens, matchToken(uint32(d.length-baseMatchLength), uint32(d.offset-baseMatchOffset)))
                         } else {
-                               d.tokens = append(d.tokens, matchToken(uint32(prevLength-3), uint32(prevOffset-minOffsetSize)))
+                               d.tokens = append(d.tokens, matchToken(uint32(prevLength-baseMatchLength), uint32(prevOffset-baseMatchOffset)))
                         }
                         // Insert in the hash table all strings up to the end of the match.
                         // index and index-1 are already inserted. If there is not enough
diff --git a/src/compress/flate/deflatefast.go b/src/compress/flate/deflatefast.go

index 6cff27c00ab14f8ec2c2c2e3f53dbeb961acc813..6b881a477c4e2afcd5735ec686327dc6ced893fa 100644 (file)
--- a/src/compress/flate/deflatefast.go
+++ b/src/compress/flate/deflatefast.go
@@ -8,8 +8,6 @@ package flate
  // based on Snappy's LZ77-style encoder: github.com/golang/snappy
  
  const (
-       maxOffset = 1 << logMaxOffsetSize // Maximum deflate offset.
-
         tableBits  = 14             // Bits used in the table.
         tableSize  = 1 << tableBits // Size of the table.
         tableMask  = tableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
@@ -97,7 +95,8 @@ func encodeBestSpeed(dst []token, src []byte) []token {
                         candidate = int(table[nextHash&tableMask])
                         table[nextHash&tableMask] = uint16(s)
                         nextHash = hash(load32(src, nextS))
-                       if s-candidate < maxOffset && load32(src, s) == load32(src, candidate) {
+                       // TODO: < should be <=, and add a test for that.
+                       if s-candidate < maxMatchOffset && load32(src, s) == load32(src, candidate) {
                                 break
                         }
                 }
@@ -133,7 +132,7 @@ func encodeBestSpeed(dst []token, src []byte) []token {
                         }
  
                         // matchToken is flate's equivalent of Snappy's emitCopy.
-                       dst = append(dst, matchToken(uint32(s-base-3), uint32(base-candidate-minOffsetSize)))
+                       dst = append(dst, matchToken(uint32(s-base-baseMatchLength), uint32(base-candidate-baseMatchOffset)))
                         nextEmit = s
                         if s >= sLimit {
                                 goto emitRemainder
@@ -151,7 +150,8 @@ func encodeBestSpeed(dst []token, src []byte) []token {
                         currHash := hash(uint32(x >> 8))
                         candidate = int(table[currHash&tableMask])
                         table[currHash&tableMask] = uint16(s)
-                       if s-candidate >= maxOffset || uint32(x>>8) != load32(src, candidate) {
+                       // TODO: >= should be >, and add a test for that.
+                       if s-candidate >= maxMatchOffset || uint32(x>>8) != load32(src, candidate) {
                                 nextHash = hash(uint32(x >> 16))
                                 s++
                                 break
diff --git a/src/compress/flate/inflate.go b/src/compress/flate/inflate.go

index d5f55eab342017a9b6ca6ccbb7d108870ab2e0f9..c1a4b60cd776b131931aa7ddbcc98305c4fc70a0 100644 (file)
--- a/src/compress/flate/inflate.go
+++ b/src/compress/flate/inflate.go
@@ -15,8 +15,7 @@ import (
  )
  
  const (
-       maxCodeLen = 16    // max length of Huffman code
-       maxHist    = 32768 // max history required
+       maxCodeLen = 16 // max length of Huffman code
         // The next three numbers come from the RFC section 3.2.7, with the
         // additional proviso in section 3.2.5 which implies that distance codes
         // 30 and 31 should never occur in compressed data.
@@ -767,7 +766,7 @@ func (f *decompressor) Reset(r io.Reader, dict []byte) error {
                 dict:     f.dict,
                 step:     (*decompressor).nextBlock,
         }
-       f.dict.init(maxHist, nil)
+       f.dict.init(maxMatchOffset, nil)
         return nil
  }
  
@@ -787,7 +786,7 @@ func NewReader(r io.Reader) io.ReadCloser {
         f.bits = new([maxNumLit + maxNumDist]int)
         f.codebits = new([numCodes]int)
         f.step = (*decompressor).nextBlock
-       f.dict.init(maxHist, nil)
+       f.dict.init(maxMatchOffset, nil)
         return &f
  }
  
@@ -806,6 +805,6 @@ func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser {
         f.bits = new([maxNumLit + maxNumDist]int)
         f.codebits = new([numCodes]int)
         f.step = (*decompressor).nextBlock
-       f.dict.init(maxHist, dict)
+       f.dict.init(maxMatchOffset, dict)
         return &f
  }
author	Nigel Tao <nigeltao@golang.org>
	Wed, 4 May 2016 20:43:13 +0000 (06:43 +1000)
committer	Nigel Tao <nigeltao@golang.org>
	Thu, 5 May 2016 00:16:39 +0000 (00:16 +0000)
src/compress/flate/deflate.go		patch \| blob \| history
src/compress/flate/deflatefast.go		patch \| blob \| history
src/compress/flate/inflate.go		patch \| blob \| history