From f20b1809f213c662932106a68c76ea3545eab1ee Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Sun, 10 Apr 2016 13:43:24 +0200
Subject: [PATCH] compress/flate: eliminate most common bounds checks
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This uses the SSA compiler to eliminate various unneeded bounds
checks in loops and various lookups.

This fixes the low hanging fruit, without any major code changes.

name                       old time/op    new time/op    delta
EncodeDigitsHuffman1e4-8     49.9Âµs Â± 1%    48.1Âµs Â± 1%  -3.74%   (p=0.000 n=10+9)
EncodeDigitsHuffman1e5-8      476Âµs Â± 1%     458Âµs Â± 1%  -3.58%  (p=0.000 n=10+10)
EncodeDigitsHuffman1e6-8     4.80ms Â± 2%    4.56ms Â± 1%  -5.07%   (p=0.000 n=10+9)
EncodeDigitsSpeed1e4-8        305Âµs Â± 3%     290Âµs Â± 2%  -5.03%   (p=0.000 n=10+9)
EncodeDigitsSpeed1e5-8       3.67ms Â± 2%    3.49ms Â± 2%  -4.78%   (p=0.000 n=9+10)
EncodeDigitsSpeed1e6-8       38.3ms Â± 2%    35.8ms Â± 1%  -6.58%   (p=0.000 n=9+10)
EncodeDigitsDefault1e4-8      361Âµs Â± 2%     346Âµs Â± 3%  -4.12%   (p=0.000 n=10+9)
EncodeDigitsDefault1e5-8     5.24ms Â± 2%    4.96ms Â± 3%  -5.38%  (p=0.000 n=10+10)
EncodeDigitsDefault1e6-8     56.5ms Â± 3%    52.2ms Â± 2%  -7.68%  (p=0.000 n=10+10)
EncodeDigitsCompress1e4-8     362Âµs Â± 2%     343Âµs Â± 1%  -5.20%   (p=0.000 n=10+9)
EncodeDigitsCompress1e5-8    5.26ms Â± 3%    4.98ms Â± 2%  -5.48%  (p=0.000 n=10+10)
EncodeDigitsCompress1e6-8    56.0ms Â± 4%    52.1ms Â± 1%  -7.01%  (p=0.000 n=10+10)
EncodeTwainHuffman1e4-8      70.9Âµs Â± 3%    64.7Âµs Â± 1%  -8.68%   (p=0.000 n=10+9)
EncodeTwainHuffman1e5-8       556Âµs Â± 2%     524Âµs Â± 2%  -5.84%  (p=0.000 n=10+10)
EncodeTwainHuffman1e6-8      5.54ms Â± 3%    5.22ms Â± 2%  -5.70%  (p=0.000 n=10+10)
EncodeTwainSpeed1e4-8         294Âµs Â± 3%     284Âµs Â± 1%  -3.71%  (p=0.000 n=10+10)
EncodeTwainSpeed1e5-8        2.59ms Â± 2%    2.48ms Â± 1%  -4.14%   (p=0.000 n=10+9)
EncodeTwainSpeed1e6-8        25.6ms Â± 1%    24.3ms Â± 1%  -5.28%   (p=0.000 n=9+10)
EncodeTwainDefault1e4-8       419Âµs Â± 2%     396Âµs Â± 1%  -5.59%   (p=0.000 n=10+9)
EncodeTwainDefault1e5-8      6.23ms Â± 4%    5.75ms Â± 1%  -7.83%   (p=0.000 n=10+9)
EncodeTwainDefault1e6-8      66.2ms Â± 2%    61.4ms Â± 1%  -7.22%  (p=0.000 n=10+10)
EncodeTwainCompress1e4-8      426Âµs Â± 1%     405Âµs Â± 1%  -4.97%   (p=0.000 n=9+10)
EncodeTwainCompress1e5-8     6.80ms Â± 1%    6.32ms Â± 1%  -6.97%   (p=0.000 n=9+10)
EncodeTwainCompress1e6-8     74.6ms Â± 3%    68.7ms Â± 1%  -7.90%   (p=0.000 n=10+9)

name                       old speed      new speed      delta
EncodeDigitsHuffman1e4-8    200MB/s Â± 1%   208MB/s Â± 1%  +3.88%   (p=0.000 n=10+9)
EncodeDigitsHuffman1e5-8    210MB/s Â± 1%   218MB/s Â± 1%  +3.71%  (p=0.000 n=10+10)
EncodeDigitsHuffman1e6-8    208MB/s Â± 2%   219MB/s Â± 1%  +5.32%   (p=0.000 n=10+9)
EncodeDigitsSpeed1e4-8     32.8MB/s Â± 3%  34.5MB/s Â± 2%  +5.29%   (p=0.000 n=10+9)
EncodeDigitsSpeed1e5-8     27.2MB/s Â± 2%  28.6MB/s Â± 2%  +5.29%  (p=0.000 n=10+10)
EncodeDigitsSpeed1e6-8     26.1MB/s Â± 2%  27.9MB/s Â± 1%  +7.02%   (p=0.000 n=9+10)
EncodeDigitsDefault1e4-8   27.7MB/s Â± 2%  28.9MB/s Â± 3%  +4.30%   (p=0.000 n=10+9)
EncodeDigitsDefault1e5-8   19.1MB/s Â± 2%  20.2MB/s Â± 3%  +5.69%  (p=0.000 n=10+10)
EncodeDigitsDefault1e6-8   17.7MB/s Â± 3%  19.2MB/s Â± 2%  +8.31%  (p=0.000 n=10+10)
EncodeDigitsCompress1e4-8  27.6MB/s Â± 2%  29.1MB/s Â± 1%  +5.47%   (p=0.000 n=10+9)
EncodeDigitsCompress1e5-8  19.0MB/s Â± 3%  20.1MB/s Â± 2%  +5.78%  (p=0.000 n=10+10)
EncodeDigitsCompress1e6-8  17.9MB/s Â± 4%  19.2MB/s Â± 1%  +7.50%  (p=0.000 n=10+10)
EncodeTwainHuffman1e4-8     141MB/s Â± 3%   154MB/s Â± 1%  +9.46%   (p=0.000 n=10+9)
EncodeTwainHuffman1e5-8     180MB/s Â± 2%   191MB/s Â± 2%  +6.19%  (p=0.000 n=10+10)
EncodeTwainHuffman1e6-8     181MB/s Â± 3%   192MB/s Â± 2%  +6.02%  (p=0.000 n=10+10)
EncodeTwainSpeed1e4-8      34.0MB/s Â± 3%  35.3MB/s Â± 1%  +3.84%  (p=0.000 n=10+10)
EncodeTwainSpeed1e5-8      38.7MB/s Â± 2%  40.3MB/s Â± 1%  +4.30%   (p=0.000 n=10+9)
EncodeTwainSpeed1e6-8      39.1MB/s Â± 1%  41.2MB/s Â± 1%  +5.57%   (p=0.000 n=9+10)
EncodeTwainDefault1e4-8    23.9MB/s Â± 2%  25.3MB/s Â± 1%  +5.91%   (p=0.000 n=10+9)
EncodeTwainDefault1e5-8    16.0MB/s Â± 4%  17.4MB/s Â± 1%  +8.47%   (p=0.000 n=10+9)
EncodeTwainDefault1e6-8    15.1MB/s Â± 2%  16.3MB/s Â± 1%  +7.76%  (p=0.000 n=10+10)
EncodeTwainCompress1e4-8   23.5MB/s Â± 1%  24.7MB/s Â± 1%  +5.24%   (p=0.000 n=9+10)
EncodeTwainCompress1e5-8   14.7MB/s Â± 1%  15.8MB/s Â± 1%  +7.50%   (p=0.000 n=9+10)
EncodeTwainCompress1e6-8   13.4MB/s Â± 3%  14.6MB/s Â± 1%  +8.57%   (p=0.000 n=10+9)

Change-Id: I5c7e84c2f9ea4d38a2115995705eebb93387e22f
Reviewed-on: https://go-review.googlesource.com/21759
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
---
 src/compress/flate/deflate.go            | 37 ++++++++--------
 src/compress/flate/huffman_bit_writer.go | 55 +++++++++++++-----------
 src/compress/flate/huffman_code.go       |  4 +-
 3 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/src/compress/flate/deflate.go b/src/compress/flate/deflate.go
index 3bb8b5e02a..d8bbffbc66 100644
--- a/src/compress/flate/deflate.go
+++ b/src/compress/flate/deflate.go
@@ -73,8 +73,8 @@ type compressor struct {
 	// hashPrev[hashHead[hashValue] & windowMask] contains the previous index
 	// with the same hash value.
 	chainHead  int
-	hashHead   []uint32
-	hashPrev   []uint32
+	hashHead   [hashSize]uint32
+	hashPrev   [windowSize]uint32
 	hashOffset int
 
 	// input window: unprocessed data is window[index:windowEnd]
@@ -188,12 +188,13 @@ func (d *compressor) fillWindow(b []byte) {
 		var newH uint32
 		for i, val := range dst {
 			di := i + index
-			newH = val & hashMask
+			newH = val
+			hh := &d.hashHead[newH&hashMask]
 			// Get previous value with the same hash.
 			// Our chain should point to the previous value.
-			d.hashPrev[di&windowMask] = d.hashHead[newH]
+			d.hashPrev[di&windowMask] = *hh
 			// Set the head of the hash chain to us.
-			d.hashHead[newH] = uint32(di + d.hashOffset)
+			*hh = uint32(di + d.hashOffset)
 		}
 		d.hash = newH
 	}
@@ -293,6 +294,7 @@ func bulkHash4(b []byte, dst []uint32) {
 // bytes in size.
 func matchLen(a, b []byte, max int) int {
 	a = a[:max]
+	b = b[:len(a)]
 	for i, av := range a {
 		if b[i] != av {
 			return i
@@ -302,8 +304,6 @@ func matchLen(a, b []byte, max int) int {
 }
 
 func (d *compressor) initDeflate() {
-	d.hashHead = make([]uint32, hashSize)
-	d.hashPrev = make([]uint32, windowSize)
 	d.window = make([]byte, 2*windowSize)
 	d.hashOffset = 1
 	d.tokens = make([]token, 0, maxFlateBlockTokens+1)
@@ -358,9 +358,10 @@ Loop:
 		if d.index < d.maxInsertIndex {
 			// Update the hash
 			d.hash = hash4(d.window[d.index : d.index+minMatchLength])
-			d.chainHead = int(d.hashHead[d.hash])
+			hh := &d.hashHead[d.hash&hashMask]
+			d.chainHead = int(*hh)
 			d.hashPrev[d.index&windowMask] = uint32(d.chainHead)
-			d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
+			*hh = uint32(d.index + d.hashOffset)
 		}
 		prevLength := d.length
 		prevOffset := d.offset
@@ -404,9 +405,10 @@ Loop:
 						d.hash = hash4(d.window[d.index : d.index+minMatchLength])
 						// Get previous value with the same hash.
 						// Our chain should point to the previous value.
-						d.hashPrev[d.index&windowMask] = d.hashHead[d.hash]
+						hh := &d.hashHead[d.hash&hashMask]
+						d.hashPrev[d.index&windowMask] = *hh
 						// Set the head of the hash chain to us.
-						d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
+						*hh = uint32(d.index + d.hashOffset)
 					}
 				}
 				if d.fastSkipHashing == skipNever {
@@ -531,9 +533,6 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 	return nil
 }
 
-// hzeroes is used for zeroing the hash slice.
-var hzeroes [256]uint32
-
 func (d *compressor) reset(w io.Writer) {
 	d.w.reset(w)
 	d.sync = false
@@ -543,15 +542,13 @@ func (d *compressor) reset(w io.Writer) {
 		d.windowEnd = 0
 	default:
 		d.chainHead = -1
-		for s := d.hashHead; len(s) > 0; {
-			n := copy(s, hzeroes[:])
-			s = s[n:]
+		for i := range d.hashHead {
+			d.hashHead[i] = 0
 		}
-		for s := d.hashPrev; len(s) > 0; s = s[len(hzeroes):] {
-			copy(s, hzeroes[:])
+		for i := range d.hashPrev {
+			d.hashPrev[i] = 0
 		}
 		d.hashOffset = 1
-
 		d.index, d.windowEnd = 0, 0
 		d.blockStart, d.byteAvailable = 0, false
 		d.tokens = d.tokens[:0]
diff --git a/src/compress/flate/huffman_bit_writer.go b/src/compress/flate/huffman_bit_writer.go
index b99f86ea13..23f242f88e 100644
--- a/src/compress/flate/huffman_bit_writer.go
+++ b/src/compress/flate/huffman_bit_writer.go
@@ -84,11 +84,11 @@ type huffmanBitWriter struct {
 	bits            uint64
 	nbits           uint
 	bytes           [bufferSize]byte
+	codegenFreq     [codegenCodeCount]int32
 	nbytes          int
 	literalFreq     []int32
 	offsetFreq      []int32
 	codegen         []uint8
-	codegenFreq     []int32
 	literalEncoding *huffmanEncoder
 	offsetEncoding  *huffmanEncoder
 	codegenEncoding *huffmanEncoder
@@ -101,7 +101,6 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
 		literalFreq:     make([]int32, maxNumLit),
 		offsetFreq:      make([]int32, offsetCodeCount),
 		codegen:         make([]uint8, maxNumLit+offsetCodeCount+1),
-		codegenFreq:     make([]int32, codegenCodeCount),
 		literalEncoding: newHuffmanEncoder(maxNumLit),
 		codegenEncoding: newHuffmanEncoder(codegenCodeCount),
 		offsetEncoding:  newHuffmanEncoder(offsetCodeCount),
@@ -143,12 +142,13 @@ func (w *huffmanBitWriter) writeBits(b int32, nb uint) {
 		w.bits >>= 48
 		w.nbits -= 48
 		n := w.nbytes
-		w.bytes[n+0] = byte(bits)
-		w.bytes[n+1] = byte(bits >> 8)
-		w.bytes[n+2] = byte(bits >> 16)
-		w.bytes[n+3] = byte(bits >> 24)
-		w.bytes[n+4] = byte(bits >> 32)
-		w.bytes[n+5] = byte(bits >> 40)
+		bytes := w.bytes[n : n+6]
+		bytes[0] = byte(bits)
+		bytes[1] = byte(bits >> 8)
+		bytes[2] = byte(bits >> 16)
+		bytes[3] = byte(bits >> 24)
+		bytes[4] = byte(bits >> 32)
+		bytes[5] = byte(bits >> 40)
 		n += 6
 		if n >= bufferFlushSize {
 			_, w.err = w.w.Write(w.bytes[:n])
@@ -293,12 +293,13 @@ func (w *huffmanBitWriter) writeCode(c hcode) {
 		w.bits >>= 48
 		w.nbits -= 48
 		n := w.nbytes
-		w.bytes[n+0] = byte(bits)
-		w.bytes[n+1] = byte(bits >> 8)
-		w.bytes[n+2] = byte(bits >> 16)
-		w.bytes[n+3] = byte(bits >> 24)
-		w.bytes[n+4] = byte(bits >> 32)
-		w.bytes[n+5] = byte(bits >> 40)
+		bytes := w.bytes[n : n+6]
+		bytes[0] = byte(bits)
+		bytes[1] = byte(bits >> 8)
+		bytes[2] = byte(bits >> 16)
+		bytes[3] = byte(bits >> 24)
+		bytes[4] = byte(bits >> 32)
+		bytes[5] = byte(bits >> 40)
 		n += 6
 		if n >= bufferFlushSize {
 			_, w.err = w.w.Write(w.bytes[:n])
@@ -428,13 +429,13 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
 	// Generate codegen and codegenFrequencies, which indicates how to encode
 	// the literalEncoding and the offsetEncoding.
 	w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
-	w.codegenEncoding.generate(w.codegenFreq, 7)
+	w.codegenEncoding.generate(w.codegenFreq[:], 7)
 	numCodegens = len(w.codegenFreq)
 	for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
 		numCodegens--
 	}
 	dynamicHeader := int64(3+5+5+4+(3*numCodegens)) +
-		w.codegenEncoding.bitLength(w.codegenFreq) +
+		w.codegenEncoding.bitLength(w.codegenFreq[:]) +
 		int64(extraBits) +
 		int64(w.codegenFreq[16]*2) +
 		int64(w.codegenFreq[17]*3) +
@@ -482,7 +483,7 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []b
 	// Generate codegen and codegenFrequencies, which indicates how to encode
 	// the literalEncoding and the offsetEncoding.
 	w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
-	w.codegenEncoding.generate(w.codegenFreq, 7)
+	w.codegenEncoding.generate(w.codegenFreq[:], 7)
 	numCodegens := len(w.codegenFreq)
 	for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
 		numCodegens--
@@ -609,13 +610,13 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
 	// Generate codegen and codegenFrequencies, which indicates how to encode
 	// the literalEncoding and the offsetEncoding.
 	w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset)
-	w.codegenEncoding.generate(w.codegenFreq, 7)
+	w.codegenEncoding.generate(w.codegenFreq[:], 7)
 	numCodegens = len(w.codegenFreq)
 	for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
 		numCodegens--
 	}
 	headerSize := int64(3+5+5+4+(3*numCodegens)) +
-		w.codegenEncoding.bitLength(w.codegenFreq) +
+		w.codegenEncoding.bitLength(w.codegenFreq[:]) +
 		int64(w.codegenFreq[16]*2) +
 		int64(w.codegenFreq[17]*3) +
 		int64(w.codegenFreq[18]*7)
@@ -639,7 +640,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
 
 	// Huffman.
 	w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
-	encoding := w.literalEncoding.codes
+	encoding := w.literalEncoding.codes[:257]
 	n := w.nbytes
 	for _, t := range input {
 		// Bitwriting inlined, ~30% speedup
@@ -653,12 +654,13 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
 		bits := w.bits
 		w.bits >>= 48
 		w.nbits -= 48
-		w.bytes[n+0] = byte(bits)
-		w.bytes[n+1] = byte(bits >> 8)
-		w.bytes[n+2] = byte(bits >> 16)
-		w.bytes[n+3] = byte(bits >> 24)
-		w.bytes[n+4] = byte(bits >> 32)
-		w.bytes[n+5] = byte(bits >> 40)
+		bytes := w.bytes[n : n+6]
+		bytes[0] = byte(bits)
+		bytes[1] = byte(bits >> 8)
+		bytes[2] = byte(bits >> 16)
+		bytes[3] = byte(bits >> 24)
+		bytes[4] = byte(bits >> 32)
+		bytes[5] = byte(bits >> 40)
 		n += 6
 		if n < bufferFlushSize {
 			continue
@@ -677,6 +679,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
 //
 // len(h) must be >= 256, and h's elements must be all zeroes.
 func histogram(b []byte, h []int32) {
+	h = h[:256]
 	for _, t := range b {
 		h[t]++
 	}
diff --git a/src/compress/flate/huffman_code.go b/src/compress/flate/huffman_code.go
index b0328c6e08..20fb19090d 100644
--- a/src/compress/flate/huffman_code.go
+++ b/src/compress/flate/huffman_code.go
@@ -96,8 +96,8 @@ func generateFixedLiteralEncoding() *huffmanEncoder {
 func generateFixedOffsetEncoding() *huffmanEncoder {
 	h := newHuffmanEncoder(30)
 	codes := h.codes
-	for ch := uint16(0); ch < 30; ch++ {
-		codes[ch] = hcode{code: reverseBits(ch, 5), len: 5}
+	for ch := range codes {
+		codes[ch] = hcode{code: reverseBits(uint16(ch), 5), len: 5}
 	}
 	return h
 }
-- 
2.52.0