compress/flate: use uncompressed if dynamic encoding is larger

author Klaus Post <klauspost@gmail.com>

Sun, 10 Apr 2016 10:00:13 +0000 (12:00 +0200)

committer Nigel Tao <nigeltao@golang.org>

Mon, 18 Apr 2016 02:30:46 +0000 (02:30 +0000)
author Klaus Post <klauspost@gmail.com>
Sun, 10 Apr 2016 10:00:13 +0000 (12:00 +0200)
committer Nigel Tao <nigeltao@golang.org>
Mon, 18 Apr 2016 02:30:46 +0000 (02:30 +0000)
diff --git a/src/compress/flate/huffman_bit_writer.go b/src/compress/flate/huffman_bit_writer.go

index d0206e59cf911881cfe19991cac5981008c5f258..c4adef9ff53010b6db8d86f824bf235dc8b87ffd 100644 (file)
--- a/src/compress/flate/huffman_bit_writer.go
+++ b/src/compress/flate/huffman_bit_writer.go
@@ -6,7 +6,6 @@ package flate
  
  import (
         "io"
-       "math"
  )
  
  const (
@@ -282,6 +281,46 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE
         codegen[outIndex] = badCode
  }
  
+// dynamicSize returns the size of dynamically encoded data in bits.
+func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) {
+       numCodegens = len(w.codegenFreq)
+       for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
+               numCodegens--
+       }
+       header := 3 + 5 + 5 + 4 + (3 * numCodegens) +
+               w.codegenEncoding.bitLength(w.codegenFreq[:]) +
+               int(w.codegenFreq[16])*2 +
+               int(w.codegenFreq[17])*3 +
+               int(w.codegenFreq[18])*7
+       size = header +
+               litEnc.bitLength(w.literalFreq) +
+               offEnc.bitLength(w.offsetFreq) +
+               extraBits
+
+       return size, numCodegens
+}
+
+// fixedSize returns the size of fixed Huffman encoded data in bits.
+func (w *huffmanBitWriter) fixedSize(extraBits int) int {
+       return 3 +
+               fixedLiteralEncoding.bitLength(w.literalFreq) +
+               fixedOffsetEncoding.bitLength(w.offsetFreq) +
+               extraBits
+}
+
+// storedSize calculates the stored size, including header.
+// The function returns the size in bits and whether the input
+// fits inside a single stored block.
+func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) {
+       if in == nil {
+               return 0, false
+       }
+       if len(in) <= maxStoreBlockSize {
+               return (len(in) + 5) * 8, true
+       }
+       return 0, false
+}
+
  func (w *huffmanBitWriter) writeCode(c hcode) {
         if w.err != nil {
                 return
@@ -384,6 +423,11 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) {
         w.writeBits(value, 3)
  }
  
+// writeBlock will write a block of tokens with the smallest encoding.
+// The original input can be supplied, and if the huffman encoded data
+// is larger than the original bytes, the data will be written as a
+// stored block.
+// If the input is nil, the tokens will always be Huffman encoded.
  func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
         if w.err != nil {
                 return
@@ -392,36 +436,28 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
         tokens = append(tokens, endBlockMarker)
         numLiterals, numOffsets := w.indexTokens(tokens)
  
-       storedBytes := 0
-       if input != nil {
-               storedBytes = len(input)
-       }
-       var extraBits int64
-       var storedSize int64 = math.MaxInt64
-       if storedBytes <= maxStoreBlockSize && input != nil {
-               storedSize = int64((storedBytes + 5) * 8)
+       var extraBits int
+       storedSize, storable := w.storedSize(input)
+       if storable {
                 // We only bother calculating the costs of the extra bits required by
                 // the length or offset fields (which will be the same for both fixed
                 // and dynamic encoding), if we need to compare those two encodings
                 // against stored encoding.
                 for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ {
                         // First eight length codes have extra size = 0.
-                       extraBits += int64(w.literalFreq[lengthCode]) * int64(lengthExtraBits[lengthCode-lengthCodesStart])
+                       extraBits += int(w.literalFreq[lengthCode]) * int(lengthExtraBits[lengthCode-lengthCodesStart])
                 }
                 for offsetCode := 4; offsetCode < numOffsets; offsetCode++ {
                         // First four offset codes have extra size = 0.
-                       extraBits += int64(w.offsetFreq[offsetCode]) * int64(offsetExtraBits[offsetCode])
+                       extraBits += int(w.offsetFreq[offsetCode]) * int(offsetExtraBits[offsetCode])
                 }
         }
  
         // Figure out smallest code.
         // Fixed Huffman baseline.
-       var size = int64(3) +
-               fixedLiteralEncoding.bitLength(w.literalFreq) +
-               fixedOffsetEncoding.bitLength(w.offsetFreq) +
-               extraBits
         var literalEncoding = fixedLiteralEncoding
         var offsetEncoding = fixedOffsetEncoding
+       var size = w.fixedSize(extraBits)
  
         // Dynamic Huffman?
         var numCodegens int
@@ -430,19 +466,7 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
         // the literalEncoding and the offsetEncoding.
         w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
         w.codegenEncoding.generate(w.codegenFreq[:], 7)
-       numCodegens = len(w.codegenFreq)
-       for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
-               numCodegens--
-       }
-       dynamicHeader := int64(3+5+5+4+(3*numCodegens)) +
-               w.codegenEncoding.bitLength(w.codegenFreq[:]) +
-               extraBits +
-               int64(w.codegenFreq[16]*2) +
-               int64(w.codegenFreq[17]*3) +
-               int64(w.codegenFreq[18]*7)
-       dynamicSize := dynamicHeader +
-               w.literalEncoding.bitLength(w.literalFreq) +
-               w.offsetEncoding.bitLength(w.offsetFreq)
+       dynamicSize, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits)
  
         if dynamicSize < size {
                 size = dynamicSize
@@ -451,9 +475,9 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
         }
  
         // Stored bytes?
-       if storedSize < size {
-               w.writeStoredHeader(storedBytes, eof)
-               w.writeBytes(input[:storedBytes])
+       if storable && storedSize < size {
+               w.writeStoredHeader(len(input), eof)
+               w.writeBytes(input)
                 return
         }
  
@@ -466,12 +490,13 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
  
         // Write the tokens.
         w.writeTokens(tokens, literalEncoding.codes, offsetEncoding.codes)
-
  }
  
  // writeBlockDynamic encodes a block using a dynamic Huffman table.
  // This should be used if the symbols used have a disproportionate
  // histogram distribution.
+// If input is supplied and the compression savings are below 1/16th of the
+// input size the block is stored.
  func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []byte) {
         if w.err != nil {
                 return
@@ -484,9 +509,13 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []b
         // the literalEncoding and the offsetEncoding.
         w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
         w.codegenEncoding.generate(w.codegenFreq[:], 7)
-       numCodegens := len(w.codegenFreq)
-       for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
-               numCodegens--
+       size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0)
+
+       // Store bytes, if we don't get a reasonable improvement.
+       if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+               w.writeStoredHeader(len(input), eof)
+               w.writeBytes(input)
+               return
         }
  
         // Write Huffman table.
@@ -611,29 +640,11 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
         // the literalEncoding and the offsetEncoding.
         w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset)
         w.codegenEncoding.generate(w.codegenFreq[:], 7)
-       numCodegens = len(w.codegenFreq)
-       for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
-               numCodegens--
-       }
-       headerSize := int64(3+5+5+4+(3*numCodegens)) +
-               w.codegenEncoding.bitLength(w.codegenFreq[:]) +
-               int64(w.codegenFreq[16]*2) +
-               int64(w.codegenFreq[17]*3) +
-               int64(w.codegenFreq[18]*7)
-
-       // Includes EOB marker
-       size := headerSize + w.literalEncoding.bitLength(w.literalFreq)
-
-       // Calculate stored size
-       var storedSize int64 = math.MaxInt64
-       var storedBytes = len(input)
-       if storedBytes <= maxStoreBlockSize {
-               storedSize = int64(storedBytes+5) * 8
-       }
+       size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0)
  
         // Store bytes, if we don't get a reasonable improvement.
-       if storedSize < (size + size>>4) {
-               w.writeStoredHeader(storedBytes, eof)
+       if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+               w.writeStoredHeader(len(input), eof)
                 w.writeBytes(input)
                 return
         }
diff --git a/src/compress/flate/huffman_code.go b/src/compress/flate/huffman_code.go

index 20fb19090d61b5ffa94f47682505b3f1ff5ed25a..bdcbd823b00a79efd79fa9ef2a040f7c01d57edb 100644 (file)
--- a/src/compress/flate/huffman_code.go
+++ b/src/compress/flate/huffman_code.go
@@ -105,11 +105,11 @@ func generateFixedOffsetEncoding() *huffmanEncoder {
  var fixedLiteralEncoding *huffmanEncoder = generateFixedLiteralEncoding()
  var fixedOffsetEncoding *huffmanEncoder = generateFixedOffsetEncoding()
  
-func (h *huffmanEncoder) bitLength(freq []int32) int64 {
-       var total int64
+func (h *huffmanEncoder) bitLength(freq []int32) int {
+       var total int
         for i, f := range freq {
                 if f != 0 {
-                       total += int64(f) * int64(h.codes[i].len)
+                       total += int(f) * int(h.codes[i].len)
                 }
         }
         return total
diff --git a/src/compress/flate/testdata/huffman-rand-1k.dyn.expect b/src/compress/flate/testdata/huffman-rand-1k.dyn.expect

index 0c24742fde2487e3a454ec3364f15e541693c37c..09dc798ee37df82176b8b7c9998c88a14207c1ad 100644 (file)

Binary files a/src/compress/flate/testdata/huffman-rand-1k.dyn.expect and b/src/compress/flate/testdata/huffman-rand-1k.dyn.expect differ
author	Klaus Post <klauspost@gmail.com>
	Sun, 10 Apr 2016 10:00:13 +0000 (12:00 +0200)
committer	Nigel Tao <nigeltao@golang.org>
	Mon, 18 Apr 2016 02:30:46 +0000 (02:30 +0000)
src/compress/flate/huffman_bit_writer.go		patch \| blob \| history
src/compress/flate/huffman_code.go		patch \| blob \| history
src/compress/flate/testdata/huffman-rand-1k.dyn.expect		patch \| blob \| history