From f20b1809f213c662932106a68c76ea3545eab1ee Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Sun, 10 Apr 2016 13:43:24 +0200 Subject: [PATCH] compress/flate: eliminate most common bounds checks MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This uses the SSA compiler to eliminate various unneeded bounds checks in loops and various lookups. This fixes the low hanging fruit, without any major code changes. name old time/op new time/op delta EncodeDigitsHuffman1e4-8 49.9µs ± 1% 48.1µs ± 1% -3.74% (p=0.000 n=10+9) EncodeDigitsHuffman1e5-8 476µs ± 1% 458µs ± 1% -3.58% (p=0.000 n=10+10) EncodeDigitsHuffman1e6-8 4.80ms ± 2% 4.56ms ± 1% -5.07% (p=0.000 n=10+9) EncodeDigitsSpeed1e4-8 305µs ± 3% 290µs ± 2% -5.03% (p=0.000 n=10+9) EncodeDigitsSpeed1e5-8 3.67ms ± 2% 3.49ms ± 2% -4.78% (p=0.000 n=9+10) EncodeDigitsSpeed1e6-8 38.3ms ± 2% 35.8ms ± 1% -6.58% (p=0.000 n=9+10) EncodeDigitsDefault1e4-8 361µs ± 2% 346µs ± 3% -4.12% (p=0.000 n=10+9) EncodeDigitsDefault1e5-8 5.24ms ± 2% 4.96ms ± 3% -5.38% (p=0.000 n=10+10) EncodeDigitsDefault1e6-8 56.5ms ± 3% 52.2ms ± 2% -7.68% (p=0.000 n=10+10) EncodeDigitsCompress1e4-8 362µs ± 2% 343µs ± 1% -5.20% (p=0.000 n=10+9) EncodeDigitsCompress1e5-8 5.26ms ± 3% 4.98ms ± 2% -5.48% (p=0.000 n=10+10) EncodeDigitsCompress1e6-8 56.0ms ± 4% 52.1ms ± 1% -7.01% (p=0.000 n=10+10) EncodeTwainHuffman1e4-8 70.9µs ± 3% 64.7µs ± 1% -8.68% (p=0.000 n=10+9) EncodeTwainHuffman1e5-8 556µs ± 2% 524µs ± 2% -5.84% (p=0.000 n=10+10) EncodeTwainHuffman1e6-8 5.54ms ± 3% 5.22ms ± 2% -5.70% (p=0.000 n=10+10) EncodeTwainSpeed1e4-8 294µs ± 3% 284µs ± 1% -3.71% (p=0.000 n=10+10) EncodeTwainSpeed1e5-8 2.59ms ± 2% 2.48ms ± 1% -4.14% (p=0.000 n=10+9) EncodeTwainSpeed1e6-8 25.6ms ± 1% 24.3ms ± 1% -5.28% (p=0.000 n=9+10) EncodeTwainDefault1e4-8 419µs ± 2% 396µs ± 1% -5.59% (p=0.000 n=10+9) EncodeTwainDefault1e5-8 6.23ms ± 4% 5.75ms ± 1% -7.83% (p=0.000 n=10+9) EncodeTwainDefault1e6-8 66.2ms ± 2% 61.4ms ± 1% -7.22% (p=0.000 n=10+10) EncodeTwainCompress1e4-8 426µs ± 1% 405µs ± 1% -4.97% (p=0.000 n=9+10) EncodeTwainCompress1e5-8 6.80ms ± 1% 6.32ms ± 1% -6.97% (p=0.000 n=9+10) EncodeTwainCompress1e6-8 74.6ms ± 3% 68.7ms ± 1% -7.90% (p=0.000 n=10+9) name old speed new speed delta EncodeDigitsHuffman1e4-8 200MB/s ± 1% 208MB/s ± 1% +3.88% (p=0.000 n=10+9) EncodeDigitsHuffman1e5-8 210MB/s ± 1% 218MB/s ± 1% +3.71% (p=0.000 n=10+10) EncodeDigitsHuffman1e6-8 208MB/s ± 2% 219MB/s ± 1% +5.32% (p=0.000 n=10+9) EncodeDigitsSpeed1e4-8 32.8MB/s ± 3% 34.5MB/s ± 2% +5.29% (p=0.000 n=10+9) EncodeDigitsSpeed1e5-8 27.2MB/s ± 2% 28.6MB/s ± 2% +5.29% (p=0.000 n=10+10) EncodeDigitsSpeed1e6-8 26.1MB/s ± 2% 27.9MB/s ± 1% +7.02% (p=0.000 n=9+10) EncodeDigitsDefault1e4-8 27.7MB/s ± 2% 28.9MB/s ± 3% +4.30% (p=0.000 n=10+9) EncodeDigitsDefault1e5-8 19.1MB/s ± 2% 20.2MB/s ± 3% +5.69% (p=0.000 n=10+10) EncodeDigitsDefault1e6-8 17.7MB/s ± 3% 19.2MB/s ± 2% +8.31% (p=0.000 n=10+10) EncodeDigitsCompress1e4-8 27.6MB/s ± 2% 29.1MB/s ± 1% +5.47% (p=0.000 n=10+9) EncodeDigitsCompress1e5-8 19.0MB/s ± 3% 20.1MB/s ± 2% +5.78% (p=0.000 n=10+10) EncodeDigitsCompress1e6-8 17.9MB/s ± 4% 19.2MB/s ± 1% +7.50% (p=0.000 n=10+10) EncodeTwainHuffman1e4-8 141MB/s ± 3% 154MB/s ± 1% +9.46% (p=0.000 n=10+9) EncodeTwainHuffman1e5-8 180MB/s ± 2% 191MB/s ± 2% +6.19% (p=0.000 n=10+10) EncodeTwainHuffman1e6-8 181MB/s ± 3% 192MB/s ± 2% +6.02% (p=0.000 n=10+10) EncodeTwainSpeed1e4-8 34.0MB/s ± 3% 35.3MB/s ± 1% +3.84% (p=0.000 n=10+10) EncodeTwainSpeed1e5-8 38.7MB/s ± 2% 40.3MB/s ± 1% +4.30% (p=0.000 n=10+9) EncodeTwainSpeed1e6-8 39.1MB/s ± 1% 41.2MB/s ± 1% +5.57% (p=0.000 n=9+10) EncodeTwainDefault1e4-8 23.9MB/s ± 2% 25.3MB/s ± 1% +5.91% (p=0.000 n=10+9) EncodeTwainDefault1e5-8 16.0MB/s ± 4% 17.4MB/s ± 1% +8.47% (p=0.000 n=10+9) EncodeTwainDefault1e6-8 15.1MB/s ± 2% 16.3MB/s ± 1% +7.76% (p=0.000 n=10+10) EncodeTwainCompress1e4-8 23.5MB/s ± 1% 24.7MB/s ± 1% +5.24% (p=0.000 n=9+10) EncodeTwainCompress1e5-8 14.7MB/s ± 1% 15.8MB/s ± 1% +7.50% (p=0.000 n=9+10) EncodeTwainCompress1e6-8 13.4MB/s ± 3% 14.6MB/s ± 1% +8.57% (p=0.000 n=10+9) Change-Id: I5c7e84c2f9ea4d38a2115995705eebb93387e22f Reviewed-on: https://go-review.googlesource.com/21759 Reviewed-by: Brad Fitzpatrick Run-TryBot: Brad Fitzpatrick TryBot-Result: Gobot Gobot --- src/compress/flate/deflate.go | 37 ++++++++-------- src/compress/flate/huffman_bit_writer.go | 55 +++++++++++++----------- src/compress/flate/huffman_code.go | 4 +- 3 files changed, 48 insertions(+), 48 deletions(-) diff --git a/src/compress/flate/deflate.go b/src/compress/flate/deflate.go index 3bb8b5e02a..d8bbffbc66 100644 --- a/src/compress/flate/deflate.go +++ b/src/compress/flate/deflate.go @@ -73,8 +73,8 @@ type compressor struct { // hashPrev[hashHead[hashValue] & windowMask] contains the previous index // with the same hash value. chainHead int - hashHead []uint32 - hashPrev []uint32 + hashHead [hashSize]uint32 + hashPrev [windowSize]uint32 hashOffset int // input window: unprocessed data is window[index:windowEnd] @@ -188,12 +188,13 @@ func (d *compressor) fillWindow(b []byte) { var newH uint32 for i, val := range dst { di := i + index - newH = val & hashMask + newH = val + hh := &d.hashHead[newH&hashMask] // Get previous value with the same hash. // Our chain should point to the previous value. - d.hashPrev[di&windowMask] = d.hashHead[newH] + d.hashPrev[di&windowMask] = *hh // Set the head of the hash chain to us. - d.hashHead[newH] = uint32(di + d.hashOffset) + *hh = uint32(di + d.hashOffset) } d.hash = newH } @@ -293,6 +294,7 @@ func bulkHash4(b []byte, dst []uint32) { // bytes in size. func matchLen(a, b []byte, max int) int { a = a[:max] + b = b[:len(a)] for i, av := range a { if b[i] != av { return i @@ -302,8 +304,6 @@ func matchLen(a, b []byte, max int) int { } func (d *compressor) initDeflate() { - d.hashHead = make([]uint32, hashSize) - d.hashPrev = make([]uint32, windowSize) d.window = make([]byte, 2*windowSize) d.hashOffset = 1 d.tokens = make([]token, 0, maxFlateBlockTokens+1) @@ -358,9 +358,10 @@ Loop: if d.index < d.maxInsertIndex { // Update the hash d.hash = hash4(d.window[d.index : d.index+minMatchLength]) - d.chainHead = int(d.hashHead[d.hash]) + hh := &d.hashHead[d.hash&hashMask] + d.chainHead = int(*hh) d.hashPrev[d.index&windowMask] = uint32(d.chainHead) - d.hashHead[d.hash] = uint32(d.index + d.hashOffset) + *hh = uint32(d.index + d.hashOffset) } prevLength := d.length prevOffset := d.offset @@ -404,9 +405,10 @@ Loop: d.hash = hash4(d.window[d.index : d.index+minMatchLength]) // Get previous value with the same hash. // Our chain should point to the previous value. - d.hashPrev[d.index&windowMask] = d.hashHead[d.hash] + hh := &d.hashHead[d.hash&hashMask] + d.hashPrev[d.index&windowMask] = *hh // Set the head of the hash chain to us. - d.hashHead[d.hash] = uint32(d.index + d.hashOffset) + *hh = uint32(d.index + d.hashOffset) } } if d.fastSkipHashing == skipNever { @@ -531,9 +533,6 @@ func (d *compressor) init(w io.Writer, level int) (err error) { return nil } -// hzeroes is used for zeroing the hash slice. -var hzeroes [256]uint32 - func (d *compressor) reset(w io.Writer) { d.w.reset(w) d.sync = false @@ -543,15 +542,13 @@ func (d *compressor) reset(w io.Writer) { d.windowEnd = 0 default: d.chainHead = -1 - for s := d.hashHead; len(s) > 0; { - n := copy(s, hzeroes[:]) - s = s[n:] + for i := range d.hashHead { + d.hashHead[i] = 0 } - for s := d.hashPrev; len(s) > 0; s = s[len(hzeroes):] { - copy(s, hzeroes[:]) + for i := range d.hashPrev { + d.hashPrev[i] = 0 } d.hashOffset = 1 - d.index, d.windowEnd = 0, 0 d.blockStart, d.byteAvailable = 0, false d.tokens = d.tokens[:0] diff --git a/src/compress/flate/huffman_bit_writer.go b/src/compress/flate/huffman_bit_writer.go index b99f86ea13..23f242f88e 100644 --- a/src/compress/flate/huffman_bit_writer.go +++ b/src/compress/flate/huffman_bit_writer.go @@ -84,11 +84,11 @@ type huffmanBitWriter struct { bits uint64 nbits uint bytes [bufferSize]byte + codegenFreq [codegenCodeCount]int32 nbytes int literalFreq []int32 offsetFreq []int32 codegen []uint8 - codegenFreq []int32 literalEncoding *huffmanEncoder offsetEncoding *huffmanEncoder codegenEncoding *huffmanEncoder @@ -101,7 +101,6 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { literalFreq: make([]int32, maxNumLit), offsetFreq: make([]int32, offsetCodeCount), codegen: make([]uint8, maxNumLit+offsetCodeCount+1), - codegenFreq: make([]int32, codegenCodeCount), literalEncoding: newHuffmanEncoder(maxNumLit), codegenEncoding: newHuffmanEncoder(codegenCodeCount), offsetEncoding: newHuffmanEncoder(offsetCodeCount), @@ -143,12 +142,13 @@ func (w *huffmanBitWriter) writeBits(b int32, nb uint) { w.bits >>= 48 w.nbits -= 48 n := w.nbytes - w.bytes[n+0] = byte(bits) - w.bytes[n+1] = byte(bits >> 8) - w.bytes[n+2] = byte(bits >> 16) - w.bytes[n+3] = byte(bits >> 24) - w.bytes[n+4] = byte(bits >> 32) - w.bytes[n+5] = byte(bits >> 40) + bytes := w.bytes[n : n+6] + bytes[0] = byte(bits) + bytes[1] = byte(bits >> 8) + bytes[2] = byte(bits >> 16) + bytes[3] = byte(bits >> 24) + bytes[4] = byte(bits >> 32) + bytes[5] = byte(bits >> 40) n += 6 if n >= bufferFlushSize { _, w.err = w.w.Write(w.bytes[:n]) @@ -293,12 +293,13 @@ func (w *huffmanBitWriter) writeCode(c hcode) { w.bits >>= 48 w.nbits -= 48 n := w.nbytes - w.bytes[n+0] = byte(bits) - w.bytes[n+1] = byte(bits >> 8) - w.bytes[n+2] = byte(bits >> 16) - w.bytes[n+3] = byte(bits >> 24) - w.bytes[n+4] = byte(bits >> 32) - w.bytes[n+5] = byte(bits >> 40) + bytes := w.bytes[n : n+6] + bytes[0] = byte(bits) + bytes[1] = byte(bits >> 8) + bytes[2] = byte(bits >> 16) + bytes[3] = byte(bits >> 24) + bytes[4] = byte(bits >> 32) + bytes[5] = byte(bits >> 40) n += 6 if n >= bufferFlushSize { _, w.err = w.w.Write(w.bytes[:n]) @@ -428,13 +429,13 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { // Generate codegen and codegenFrequencies, which indicates how to encode // the literalEncoding and the offsetEncoding. w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) - w.codegenEncoding.generate(w.codegenFreq, 7) + w.codegenEncoding.generate(w.codegenFreq[:], 7) numCodegens = len(w.codegenFreq) for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { numCodegens-- } dynamicHeader := int64(3+5+5+4+(3*numCodegens)) + - w.codegenEncoding.bitLength(w.codegenFreq) + + w.codegenEncoding.bitLength(w.codegenFreq[:]) + int64(extraBits) + int64(w.codegenFreq[16]*2) + int64(w.codegenFreq[17]*3) + @@ -482,7 +483,7 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []b // Generate codegen and codegenFrequencies, which indicates how to encode // the literalEncoding and the offsetEncoding. w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding) - w.codegenEncoding.generate(w.codegenFreq, 7) + w.codegenEncoding.generate(w.codegenFreq[:], 7) numCodegens := len(w.codegenFreq) for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { numCodegens-- @@ -609,13 +610,13 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) { // Generate codegen and codegenFrequencies, which indicates how to encode // the literalEncoding and the offsetEncoding. w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset) - w.codegenEncoding.generate(w.codegenFreq, 7) + w.codegenEncoding.generate(w.codegenFreq[:], 7) numCodegens = len(w.codegenFreq) for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { numCodegens-- } headerSize := int64(3+5+5+4+(3*numCodegens)) + - w.codegenEncoding.bitLength(w.codegenFreq) + + w.codegenEncoding.bitLength(w.codegenFreq[:]) + int64(w.codegenFreq[16]*2) + int64(w.codegenFreq[17]*3) + int64(w.codegenFreq[18]*7) @@ -639,7 +640,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) { // Huffman. w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) - encoding := w.literalEncoding.codes + encoding := w.literalEncoding.codes[:257] n := w.nbytes for _, t := range input { // Bitwriting inlined, ~30% speedup @@ -653,12 +654,13 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) { bits := w.bits w.bits >>= 48 w.nbits -= 48 - w.bytes[n+0] = byte(bits) - w.bytes[n+1] = byte(bits >> 8) - w.bytes[n+2] = byte(bits >> 16) - w.bytes[n+3] = byte(bits >> 24) - w.bytes[n+4] = byte(bits >> 32) - w.bytes[n+5] = byte(bits >> 40) + bytes := w.bytes[n : n+6] + bytes[0] = byte(bits) + bytes[1] = byte(bits >> 8) + bytes[2] = byte(bits >> 16) + bytes[3] = byte(bits >> 24) + bytes[4] = byte(bits >> 32) + bytes[5] = byte(bits >> 40) n += 6 if n < bufferFlushSize { continue @@ -677,6 +679,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) { // // len(h) must be >= 256, and h's elements must be all zeroes. func histogram(b []byte, h []int32) { + h = h[:256] for _, t := range b { h[t]++ } diff --git a/src/compress/flate/huffman_code.go b/src/compress/flate/huffman_code.go index b0328c6e08..20fb19090d 100644 --- a/src/compress/flate/huffman_code.go +++ b/src/compress/flate/huffman_code.go @@ -96,8 +96,8 @@ func generateFixedLiteralEncoding() *huffmanEncoder { func generateFixedOffsetEncoding() *huffmanEncoder { h := newHuffmanEncoder(30) codes := h.codes - for ch := uint16(0); ch < 30; ch++ { - codes[ch] = hcode{code: reverseBits(ch, 5), len: 5} + for ch := range codes { + codes[ch] = hcode{code: reverseBits(uint16(ch), 5), len: 5} } return h } -- 2.48.1