From 6ec481b06c1ceba5792e355ca45f7476bb78f21f Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Sun, 10 Apr 2016 12:00:13 +0200 Subject: [PATCH] compress/flate: use uncompressed if dynamic encoding is larger MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This adds size calculation to "dynamic" writes. This ensures that if dynamic Huffman encoding is bigger, or only slightly smaller than raw data, the block is written uncompressed. To minimize the code duplication of this function, the size calculation has been moved to separate functions. Since I was modifying these calculations, I changed "int64" size calculations to "int". Blocks are of very limited size, so there is not any risk of overflows. This should mainly improve 32 bit performance, but amd64 also gets a slight boost: name old time/op new time/op delta EncodeDigitsHuffman1e4-8 49.9µs ± 1% 49.3µs ± 1% -1.21% (p=0.000 n=10+10) EncodeDigitsHuffman1e5-8 476µs ± 1% 471µs ± 3% ~ (p=0.218 n=10+10) EncodeDigitsHuffman1e6-8 4.80ms ± 2% 4.75ms ± 2% ~ (p=0.243 n=10+9) EncodeDigitsSpeed1e4-8 305µs ± 3% 300µs ± 1% -1.86% (p=0.005 n=10+10) EncodeDigitsSpeed1e5-8 3.67ms ± 2% 3.58ms ± 1% -2.29% (p=0.000 n=9+8) EncodeDigitsSpeed1e6-8 38.3ms ± 2% 37.0ms ± 1% -3.45% (p=0.000 n=9+9) EncodeDigitsDefault1e4-8 361µs ± 2% 353µs ± 1% -2.21% (p=0.000 n=10+10) EncodeDigitsDefault1e5-8 5.24ms ± 2% 5.19ms ± 2% ~ (p=0.105 n=10+10) EncodeDigitsDefault1e6-8 56.5ms ± 3% 55.1ms ± 1% -2.42% (p=0.001 n=10+10) EncodeDigitsCompress1e4-8 362µs ± 2% 358µs ± 2% ~ (p=0.123 n=10+10) EncodeDigitsCompress1e5-8 5.26ms ± 3% 5.20ms ± 1% ~ (p=0.089 n=10+10) EncodeDigitsCompress1e6-8 56.0ms ± 4% 55.0ms ± 1% ~ (p=0.065 n=10+9) EncodeTwainHuffman1e4-8 70.9µs ± 3% 67.6µs ± 2% -4.59% (p=0.000 n=10+10) EncodeTwainHuffman1e5-8 556µs ± 2% 533µs ± 1% -4.20% (p=0.000 n=10+10) EncodeTwainHuffman1e6-8 5.54ms ± 3% 5.29ms ± 1% -4.37% (p=0.000 n=10+9) EncodeTwainSpeed1e4-8 294µs ± 3% 293µs ± 1% ~ (p=0.965 n=10+8) EncodeTwainSpeed1e5-8 2.59ms ± 2% 2.56ms ± 1% ~ (p=0.353 n=10+10) EncodeTwainSpeed1e6-8 25.6ms ± 1% 24.9ms ± 1% -2.62% (p=0.000 n=9+10) EncodeTwainDefault1e4-8 419µs ± 2% 417µs ± 1% ~ (p=0.780 n=10+9) EncodeTwainDefault1e5-8 6.23ms ± 4% 6.16ms ± 1% ~ (p=0.218 n=10+10) EncodeTwainDefault1e6-8 66.2ms ± 2% 65.7ms ± 1% ~ (p=0.529 n=10+10) EncodeTwainCompress1e4-8 426µs ± 1% 428µs ± 2% ~ (p=0.549 n=9+10) EncodeTwainCompress1e5-8 6.80ms ± 1% 6.85ms ± 3% ~ (p=0.156 n=9+10) EncodeTwainCompress1e6-8 74.6ms ± 3% 73.8ms ± 2% ~ (p=0.280 n=10+10) name old speed new speed delta EncodeDigitsHuffman1e4-8 200MB/s ± 1% 203MB/s ± 1% +1.23% (p=0.000 n=10+10) EncodeDigitsHuffman1e5-8 210MB/s ± 1% 212MB/s ± 3% ~ (p=0.356 n=10+9) EncodeDigitsHuffman1e6-8 208MB/s ± 2% 210MB/s ± 2% ~ (p=0.243 n=10+9) EncodeDigitsSpeed1e4-8 32.8MB/s ± 3% 33.4MB/s ± 1% +1.88% (p=0.005 n=10+10) EncodeDigitsSpeed1e5-8 27.2MB/s ± 2% 27.9MB/s ± 1% +2.60% (p=0.000 n=10+8) EncodeDigitsSpeed1e6-8 26.1MB/s ± 2% 27.0MB/s ± 1% +3.56% (p=0.000 n=9+9) EncodeDigitsDefault1e4-8 27.7MB/s ± 2% 28.4MB/s ± 1% +2.24% (p=0.000 n=10+10) EncodeDigitsDefault1e5-8 19.1MB/s ± 2% 19.3MB/s ± 2% ~ (p=0.101 n=10+10) EncodeDigitsDefault1e6-8 17.7MB/s ± 3% 18.1MB/s ± 1% +2.46% (p=0.001 n=10+10) EncodeDigitsCompress1e4-8 27.6MB/s ± 2% 27.9MB/s ± 2% ~ (p=0.119 n=10+10) EncodeDigitsCompress1e5-8 19.0MB/s ± 3% 19.2MB/s ± 1% ~ (p=0.085 n=10+10) EncodeDigitsCompress1e6-8 17.9MB/s ± 4% 18.1MB/s ± 3% ~ (p=0.110 n=10+10) EncodeTwainHuffman1e4-8 141MB/s ± 3% 148MB/s ± 2% +4.79% (p=0.000 n=10+10) EncodeTwainHuffman1e5-8 180MB/s ± 2% 188MB/s ± 1% +4.38% (p=0.000 n=10+10) EncodeTwainHuffman1e6-8 181MB/s ± 3% 189MB/s ± 1% +4.54% (p=0.000 n=10+9) EncodeTwainSpeed1e4-8 34.0MB/s ± 3% 34.1MB/s ± 1% ~ (p=0.948 n=10+8) EncodeTwainSpeed1e5-8 38.7MB/s ± 2% 39.0MB/s ± 1% ~ (p=0.353 n=10+10) EncodeTwainSpeed1e6-8 39.1MB/s ± 1% 40.1MB/s ± 1% +2.68% (p=0.000 n=9+10) EncodeTwainDefault1e4-8 23.9MB/s ± 2% 24.0MB/s ± 1% ~ (p=0.734 n=10+9) EncodeTwainDefault1e5-8 16.0MB/s ± 4% 16.2MB/s ± 1% ~ (p=0.210 n=10+10) EncodeTwainDefault1e6-8 15.1MB/s ± 2% 15.2MB/s ± 1% ~ (p=0.515 n=10+10) EncodeTwainCompress1e4-8 23.5MB/s ± 1% 23.4MB/s ± 2% ~ (p=0.536 n=9+10) EncodeTwainCompress1e5-8 14.7MB/s ± 1% 14.6MB/s ± 3% ~ (p=0.138 n=9+10) EncodeTwainCompress1e6-8 13.4MB/s ± 3% 13.5MB/s ± 2% ~ (p=0.239 n=10+10) This improves "random input" to the dynamic writer, which is why the test data is updated. The output size goes from 1051 to 1005 bytes. Change-Id: I3ee11d2d2511b277d2dd16734aeea07c98bca450 Reviewed-on: https://go-review.googlesource.com/21757 Reviewed-by: Joe Tsai Run-TryBot: Joe Tsai TryBot-Result: Gobot Gobot Reviewed-by: Nigel Tao --- src/compress/flate/huffman_bit_writer.go | 123 ++++++++++-------- src/compress/flate/huffman_code.go | 6 +- .../flate/testdata/huffman-rand-1k.dyn.expect | Bin 1054 -> 1005 bytes 3 files changed, 70 insertions(+), 59 deletions(-) diff --git a/src/compress/flate/huffman_bit_writer.go b/src/compress/flate/huffman_bit_writer.go index d0206e59cf..c4adef9ff5 100644 --- a/src/compress/flate/huffman_bit_writer.go +++ b/src/compress/flate/huffman_bit_writer.go @@ -6,7 +6,6 @@ package flate import ( "io" - "math" ) const ( @@ -282,6 +281,46 @@ func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litE codegen[outIndex] = badCode } +// dynamicSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) { + numCodegens = len(w.codegenFreq) + for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { + numCodegens-- + } + header := 3 + 5 + 5 + 4 + (3 * numCodegens) + + w.codegenEncoding.bitLength(w.codegenFreq[:]) + + int(w.codegenFreq[16])*2 + + int(w.codegenFreq[17])*3 + + int(w.codegenFreq[18])*7 + size = header + + litEnc.bitLength(w.literalFreq) + + offEnc.bitLength(w.offsetFreq) + + extraBits + + return size, numCodegens +} + +// fixedSize returns the size of dynamically encoded data in bits. +func (w *huffmanBitWriter) fixedSize(extraBits int) int { + return 3 + + fixedLiteralEncoding.bitLength(w.literalFreq) + + fixedOffsetEncoding.bitLength(w.offsetFreq) + + extraBits +} + +// storedSize calculates the stored size, including header. +// The function returns the size in bits and whether the block +// fits inside a single block. +func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { + if in == nil { + return 0, false + } + if len(in) <= maxStoreBlockSize { + return (len(in) + 5) * 8, true + } + return 0, false +} + func (w *huffmanBitWriter) writeCode(c hcode) { if w.err != nil { return @@ -384,6 +423,11 @@ func (w *huffmanBitWriter) writeFixedHeader(isEof bool) { w.writeBits(value, 3) } +// writeBlock will write a block of tokens with the smallest encoding. +// The original input can be supplied, and if the huffman encoded data +// is larger than the original bytes, the data will be written as a +// stored block. +// If the input is nil, the tokens will always be Huffman encoded. func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { if w.err != nil { return @@ -392,36 +436,28 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { tokens = append(tokens, endBlockMarker) numLiterals, numOffsets := w.indexTokens(tokens) - storedBytes := 0 - if input != nil { - storedBytes = len(input) - } - var extraBits int64 - var storedSize int64 = math.MaxInt64 - if storedBytes <= maxStoreBlockSize && input != nil { - storedSize = int64((storedBytes + 5) * 8) + var extraBits int + storedSize, storable := w.storedSize(input) + if storable { // We only bother calculating the costs of the extra bits required by // the length of offset fields (which will be the same for both fixed // and dynamic encoding), if we need to compare those two encodings // against stored encoding. for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ { // First eight length codes have extra size = 0. - extraBits += int64(w.literalFreq[lengthCode]) * int64(lengthExtraBits[lengthCode-lengthCodesStart]) + extraBits += int(w.literalFreq[lengthCode]) * int(lengthExtraBits[lengthCode-lengthCodesStart]) } for offsetCode := 4; offsetCode < numOffsets; offsetCode++ { // First four offset codes have extra size = 0. - extraBits += int64(w.offsetFreq[offsetCode]) * int64(offsetExtraBits[offsetCode]) + extraBits += int(w.offsetFreq[offsetCode]) * int(offsetExtraBits[offsetCode]) } } // Figure out smallest code. // Fixed Huffman baseline. - var size = int64(3) + - fixedLiteralEncoding.bitLength(w.literalFreq) + - fixedOffsetEncoding.bitLength(w.offsetFreq) + - extraBits var literalEncoding = fixedLiteralEncoding var offsetEncoding = fixedOffsetEncoding + var size = w.fixedSize(extraBits) // Dynamic Huffman? var numCodegens int @@ -430,19 +466,7 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { // the literalEncoding and the offsetEncoding. w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) w.codegenEncoding.generate(w.codegenFreq[:], 7) - numCodegens = len(w.codegenFreq) - for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { - numCodegens-- - } - dynamicHeader := int64(3+5+5+4+(3*numCodegens)) + - w.codegenEncoding.bitLength(w.codegenFreq[:]) + - extraBits + - int64(w.codegenFreq[16]*2) + - int64(w.codegenFreq[17]*3) + - int64(w.codegenFreq[18]*7) - dynamicSize := dynamicHeader + - w.literalEncoding.bitLength(w.literalFreq) + - w.offsetEncoding.bitLength(w.offsetFreq) + dynamicSize, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits) if dynamicSize < size { size = dynamicSize @@ -451,9 +475,9 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { } // Stored bytes? - if storedSize < size { - w.writeStoredHeader(storedBytes, eof) - w.writeBytes(input[:storedBytes]) + if storable && storedSize < size { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) return } @@ -466,12 +490,13 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { // Write the tokens. w.writeTokens(tokens, literalEncoding.codes, offsetEncoding.codes) - } // writeBlockDynamic encodes a block using a dynamic Huffman table. // This should be used if the symbols used have a disproportionate // histogram distribution. +// If input is supplied and the compression savings are below 1/16th of the +// input size the block is stored. func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []byte) { if w.err != nil { return @@ -484,9 +509,13 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []b // the literalEncoding and the offsetEncoding. w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) w.codegenEncoding.generate(w.codegenFreq[:], 7) - numCodegens := len(w.codegenFreq) - for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { - numCodegens-- + size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0) + + // Store bytes, if we don't get a reasonable improvement. + if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { + w.writeStoredHeader(len(input), eof) + w.writeBytes(input) + return } // Write Huffman table. @@ -611,29 +640,11 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) { // the literalEncoding and the offsetEncoding. w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) w.codegenEncoding.generate(w.codegenFreq[:], 7) - numCodegens = len(w.codegenFreq) - for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { - numCodegens-- - } - headerSize := int64(3+5+5+4+(3*numCodegens)) + - w.codegenEncoding.bitLength(w.codegenFreq[:]) + - int64(w.codegenFreq[16]*2) + - int64(w.codegenFreq[17]*3) + - int64(w.codegenFreq[18]*7) - - // Includes EOB marker - size := headerSize + w.literalEncoding.bitLength(w.literalFreq) - - // Calculate stored size - var storedSize int64 = math.MaxInt64 - var storedBytes = len(input) - if storedBytes <= maxStoreBlockSize { - storedSize = int64(storedBytes+5) * 8 - } + size, numCodegens := w.dynamicSize(w.literalEncoding, huffOffset, 0) // Store bytes, if we don't get a reasonable improvement. - if storedSize < (size + size>>4) { - w.writeStoredHeader(storedBytes, eof) + if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) { + w.writeStoredHeader(len(input), eof) w.writeBytes(input) return } diff --git a/src/compress/flate/huffman_code.go b/src/compress/flate/huffman_code.go index 20fb19090d..bdcbd823b0 100644 --- a/src/compress/flate/huffman_code.go +++ b/src/compress/flate/huffman_code.go @@ -105,11 +105,11 @@ func generateFixedOffsetEncoding() *huffmanEncoder { var fixedLiteralEncoding *huffmanEncoder = generateFixedLiteralEncoding() var fixedOffsetEncoding *huffmanEncoder = generateFixedOffsetEncoding() -func (h *huffmanEncoder) bitLength(freq []int32) int64 { - var total int64 +func (h *huffmanEncoder) bitLength(freq []int32) int { + var total int for i, f := range freq { if f != 0 { - total += int64(f) * int64(h.codes[i].len) + total += int(f) * int(h.codes[i].len) } } return total diff --git a/src/compress/flate/testdata/huffman-rand-1k.dyn.expect b/src/compress/flate/testdata/huffman-rand-1k.dyn.expect index 0c24742fde2487e3a454ec3364f15e541693c37c..09dc798ee37df82176b8b7c9998c88a14207c1ad 100644 GIT binary patch literal 1005 zcmVlcQ}x5eHD>U|iC=ROD8-~ViL-puE1 zjRYA__oQ{&>YEB=3*aLuz4zyXJp13Xu1};#Rhix|mTnwF zOo!rp*PZhF=TqnOy;6>9pEFaaeUqI8B!YL)2W zP7ZdtNvU6;rei#QejpQ1yJnKOE~NTM%dWXRuhSpl)r~@J@cfJn0Ny~Wi$|AEsLzhu zri&m6gnDM>m?;94<~TB71LK+=ROn-XNSxENOU6sujQmH^hn%vbF>Y9-Bf>bg4ep_N_banGD$o@)BlG0~`IFf*!A z7ZZY+$P{3oO)_oT873jzel8_va>@^q&Gy#Imx?o3b8wLzzbGT44Do}*$X0h~ljl$J4Xnb zbD&&|U+WJ#!b4}YW@ms{4#Dg|)FPD1`RJ15X*j-TWXe#-24_NUqwu$E^5|c&ujkvl zceVJ-2*h=M!1)}1Jc%#TSUTePk+ypzC+V()i{5ms{n@u^D(o_E@REe_Kn#k!Ic_d< z)NYD&D%@ZnqX*t~i*(5TV|DgDW2`fY!|?bmYqXwpi(E6b%BbX-wveIk57S|?#u}7- zL{;=f|DL5<#-Qjb!HsV;5xKrj*@u^N&pjiq)f!%|U1|gQA`KAPM`;y5?oy)&(mYZ0 z_?_gKiO6R;)m}AtC+IwYu6c3Nlk}=l5*$k#%8*z(mO5DYDWih#pN0k_;dS~5vECO-S0Dj5 literal 1054 zcmV+(1mXJxzzaAU2mk=!nayIXbMy_f)7H$mL&SF;F?3`%k8@)&&%@Oe(UOiioadDG zS>BI}35WJ&PF@*1*&LbA=aF5pFj3x*HIFRrKcto>d1~bp8)vlgPG~al`sLh_uD4>f zwcquqQs)bz`O{dU_?0E5ZyfOj3vL$R|;Io1R(-}eKi+pE+?-hv`IeFsDFRE4SU5j~y=5(3C6?qYw^br64(dswwJMG1iwh9bz5{6%{CK{d z?OTrws1RG0;tdgAc^^}S;a;h-Le*Jl$;@?4WVbi2?}j$(yZ8P0lo@^JyA?I@?GEt7oU6m&;AhmaN!WN2o4Ue&a8T%J8g~M#1p4zh)_hxG4z2`Ogny za;mxRW4Md@6TRsPIrIrmbY`0*@-5uMh;C)*<4Qh|=G6i5GP){GL)z@9EkaXFMahfN zv?c%P&)d;?j&h!ypwqm%P^YHL3jM3}%*^0B)TTYwcr0m+>#+B{By^cDULDE6Dg;&& zO=u{r#qY9CX2q~>M2)v~oJjxXwYfA4W6UEykUq9QGg?N01rigUU44BE!qnW&8XUe) zez=s$?Lpl~RS(YSo`<)!77bHS>Gu?tEqHX60yc4tb%nr56)BI?!K^R=U-@BSOT=w5 zsVIvXfM*tHgqR0EakjC|{oW&au|@y5MGR8cGC-Yn06%^E0PC$!Cb-Z0wr}jN>)ms9 zL;@;_wIK!J>p%w{0>eRLG6F9RY`9EcFXkV~=#m(_eoQp~r+?KjZg}QSSL~iFyscF_ z+e_{^90j}~rTuecm=4dkoD6jMc=)XI@ePwzb~aU<_(Cb{iZR=;j^CkY@49GGUfJU< zh|_pVqS;)85%YqWL`@t%i6ZSgMZJLK5AGbA<2Z~&Y6y(OZ<17W7CzqwPW|%tkU??k z4*_!bQ%1vsp<0>Q04?4|yQkkrm{&#|cY?4524U<_tm@Hqt*?a07|`vlpP7J3xS#y+ zPf2l=uqk^9aS%w&GBx^|J3nR zNecGe6yILAWA?u!e(Cl<@PcA2?CSjWxPaGt_JWfJ*C8~T`^Pp$vI5uS*J~uVqo@>8 Yxt^P)DKm4sCRYuzNKydW#Fu~kAGX;UBLDyZ -- 2.48.1