This encoding algorithm, which prioritizes speed over output size, is
based on Snappy's LZ77-style encoder: github.com/golang/snappy
This commit keeps the diff between this package's encodeBestSpeed
function and Snappy's encodeBlock function as small as possible (see
the diff below). Follow-up commits will improve this package's
performance and output size.
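
For context, this code path is reached through the standard flate API by
asking for BestSpeed (level 1). A minimal round-trip sketch, for
illustration only (not part of this commit):

    package main

    import (
        "bytes"
        "compress/flate"
        "io/ioutil"
        "log"
    )

    func main() {
        var buf bytes.Buffer
        // BestSpeed now selects the encodeBestSpeed path.
        w, err := flate.NewWriter(&buf, flate.BestSpeed)
        if err != nil {
            log.Fatal(err)
        }
        if _, err := w.Write([]byte("hello, hello, hello, world")); err != nil {
            log.Fatal(err)
        }
        if err := w.Close(); err != nil {
            log.Fatal(err)
        }

        r := flate.NewReader(&buf)
        got, err := ioutil.ReadAll(r)
        if err != nil {
            log.Fatal(err)
        }
        r.Close()
        log.Printf("round-tripped %d bytes", len(got))
    }
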
This package's speed benchmarks:
name old speed new speed delta
EncodeDigitsSpeed1e4-8 40.7MB/s ± 0% 73.0MB/s ± 0% +79.18% (p=0.008 n=5+5)
EncodeDigitsSpeed1e5-8 33.0MB/s ± 0% 77.3MB/s ± 1% +134.04% (p=0.008 n=5+5)
EncodeDigitsSpeed1e6-8 32.1MB/s ± 0% 82.1MB/s ± 0% +156.18% (p=0.008 n=5+5)
EncodeTwainSpeed1e4-8 42.1MB/s ± 0% 65.0MB/s ± 0% +54.61% (p=0.008 n=5+5)
EncodeTwainSpeed1e5-8 46.3MB/s ± 0% 80.0MB/s ± 0% +72.81% (p=0.008 n=5+5)
EncodeTwainSpeed1e6-8 47.3MB/s ± 0% 81.7MB/s ± 0% +72.86% (p=0.008 n=5+5)
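
The table above is benchstat output. A comparison like this can be
reproduced along these lines, assuming the golang.org/x/perf/cmd/benchstat
tool and the benchmark names above:

    go test -bench=Speed -count=5 > old.txt
    # apply this commit
    go test -bench=Speed -count=5 > new.txt
    benchstat old.txt new.txt
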
Here are the milliseconds taken, before and after this commit, to compress
a number of test files:
Go's src/compress/testdata files:
4 1 e.txt
8 4 Mark.Twain-Tom.Sawyer.txt
github.com/golang/snappy's benchmark files:
3 1 alice29.txt
12 3 asyoulik.txt
6 1 fireworks.jpeg
1 1 geo.protodata
1 0 html
2 2 html_x_4
6 3 kppkn.gtb
11 4 lcet10.txt
5 1 paper-100k.pdf
14 6 plrabn12.txt
17 6 urls.10K
Larger files linked to from
https://docs.google.com/spreadsheets/d/1VLxi-ac0BAtf735HyH3c1xRulbkYYUkFecKdLPH7NIQ/edit#gid=166102500
2409 3182 adresser.001
16757 11027 enwik9
13764 12946 gob-stream
153978 74317 rawstudio-mint14.tar
4371 770 sharnd.out
Output size is larger. In the table below, the first column is the input
size, the second column is the output size prior to this commit, and the
third column is the output size after this commit.
100003 47707 50006 e.txt
387851 172707 182930 Mark.Twain-Tom.Sawyer.txt
152089 62457 66705 alice29.txt
125179 54503 57274 asyoulik.txt
123093 122827 123108 fireworks.jpeg
118588 18574 20558 geo.protodata
102400 16601 17305 html
409600 65506 70313 html_x_4
184320 49007 50944 kppkn.gtb
426754 166957 179355 lcet10.txt
102400 82126 84937 paper-100k.pdf
481861 218617 231988 plrabn12.txt
702087 241774 258020 urls.10K
1073741824 43074110 57269781 adresser.001
1000000000 365772256 391052000 enwik9
1911399616 340364558 378679516 gob-stream
8558382592 3807229562 3972329193 rawstudio-mint14.tar
200000000 200061040 200015265 sharnd.out
The diff between github.com/golang/snappy's encodeBlock function and
this commit's encodeBestSpeed function:
1c1,7
< func encodeBlock(dst, src []byte) (d int) {
---
> func encodeBestSpeed(dst []token, src []byte) []token {
> // This check isn't in the Snappy implementation, but there, the caller
> // instead of the callee handles this case.
> if len(src) < minNonLiteralBlockSize {
> return emitLiteral(dst, src)
> }
>
4c10
< // and len(src) <= maxBlockSize and maxBlockSize == 65536.
---
> // and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535.
65c71
< if load32(src, s) == load32(src, candidate) {
---
> if s-candidate < maxOffset && load32(src, s) == load32(src, candidate) {
73c79
< d += emitLiteral(dst[d:], src[nextEmit:s])
---
> dst = emitLiteral(dst, src[nextEmit:s])
90c96
< // This is an inlined version of:
---
> // This is an inlined version of Snappy's:
93c99,103
< for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 {
---
> s1 := base + maxMatchLength
> if s1 > len(src) {
> s1 = len(src)
> }
> for i := candidate + 4; s < s1 && src[i] == src[s]; i, s = i+1, s+1 {
96c106,107
< d += emitCopy(dst[d:], base-candidate, s-base)
---
> // matchToken is flate's equivalent of Snappy's emitCopy.
> dst = append(dst, matchToken(uint32(s-base-3), uint32(base-candidate-minOffsetSize)))
114c125
< if uint32(x>>8) != load32(src, candidate) {
---
> if s-candidate >= maxOffset || uint32(x>>8) != load32(src, candidate) {
124c135
< d += emitLiteral(dst[d:], src[nextEmit:])
---
> dst = emitLiteral(dst, src[nextEmit:])
126c137
< return d
---
> return dst
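
For reference on the matchToken call above: flate match lengths start at 3
and offsets at 1 (minOffsetSize), so a match token stores length-3 and
offset-1. A sketch of the packing, mirroring compress/flate's token.go
(shown for illustration; this commit does not change token.go):

    // A match of length 7 at offset 100 is encoded as matchToken(4, 99).
    func matchToken(xlength uint32, xoffset uint32) token {
        return token(matchType + xlength<<lengthShift + xoffset)
    }
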
This change is based on https://go-review.googlesource.com/#/c/21021/ by
Klaus Post, but it is a separate changelist as cl/21021 seems to have
stalled in code review, and the Go 1.7 feature freeze approaches.
Golang-dev discussion:
https://groups.google.com/d/topic/golang-dev/XYgHX9p8IOk/discussion and
of course cl/21021.
Change-Id: Ib662439417b3bd0b61c2977c12c658db3e44d164
Reviewed-on: https://go-review.googlesource.com/22370
Reviewed-by: Russ Cox <rsc@golang.org>
}
var levels = []compressionLevel{
- {}, // 0
- // For levels 1-3 we don't bother trying with lazy matches
- {1, 4, 0, 8, 4, 4},
+ {0, 0, 0, 0, 0, 0}, // NoCompression.
+ {1, 0, 0, 0, 0, 0}, // BestSpeed uses a custom algorithm; see deflatefast.go.
+ // For levels 2-3 we don't bother trying with lazy matches.
{2, 4, 0, 16, 8, 5},
{3, 4, 0, 32, 32, 6},
// Levels 4-9 use increasingly more lazy matching
// Should only be used after a reset.
func (d *compressor) fillWindow(b []byte) {
// Do not fill window if we are in store-only mode.
- if d.compressionLevel.level == 0 {
+ if d.compressionLevel.level < 2 {
return
}
if d.index != 0 || d.windowEnd != 0 {
return max
}
+// encSpeed will compress and store the currently added data,
+// if enough has been accumulated or we are at the end of the stream.
+// Any error that occurred will be in d.err.
+func (d *compressor) encSpeed() {
+ // We only compress if we have maxStoreBlockSize.
+ if d.windowEnd < maxStoreBlockSize {
+ if !d.sync {
+ return
+ }
+
+ // Handle small sizes.
+ if d.windowEnd < 128 {
+ switch {
+ case d.windowEnd == 0:
+ return
+ case d.windowEnd <= 16:
+ d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+ default:
+ d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+ d.err = d.w.err
+ }
+ d.windowEnd = 0
+ return
+ }
+
+ }
+ // Encode the block.
+ d.tokens = encodeBestSpeed(d.tokens[:0], d.window[:d.windowEnd])
+
+ // If we removed less than 1/16th, Huffman compress the block.
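+	// (For a full 65535-byte window, that threshold is 65535-4095 = 61440 tokens.)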
+ if len(d.tokens) > d.windowEnd-(d.windowEnd>>4) {
+ d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+ } else {
+ d.w.writeBlockDynamic(d.tokens, false, d.window[:d.windowEnd])
+ }
+ d.err = d.w.err
+ d.windowEnd = 0
+}
+
func (d *compressor) initDeflate() {
d.window = make([]byte, 2*windowSize)
d.hashOffset = 1
d.window = make([]byte, maxStoreBlockSize)
d.fill = (*compressor).fillStore
d.step = (*compressor).storeHuff
+ case level == BestSpeed:
+ d.compressionLevel = levels[level]
+ d.window = make([]byte, maxStoreBlockSize)
+ d.fill = (*compressor).fillStore
+ d.step = (*compressor).encSpeed
+ d.tokens = make([]token, maxStoreBlockSize)
case level == DefaultCompression:
level = 6
fallthrough
- case 1 <= level && level <= 9:
+ case 2 <= level && level <= 9:
d.compressionLevel = levels[level]
d.initDeflate()
d.fill = (*compressor).fillDeflate
switch d.compressionLevel.level {
case NoCompression:
d.windowEnd = 0
+ case BestSpeed:
+ d.windowEnd = 0
+ d.tokens = d.tokens[:0]
default:
d.chainHead = -1
for i := range d.hashHead {
{[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 0,
[]byte{0, 8, 0, 247, 255, 17, 17, 17, 17, 17, 17, 17, 17, 1, 0, 0, 255, 255},
},
- {[]byte{}, 1, []byte{1, 0, 0, 255, 255}},
- {[]byte{0x11}, 1, []byte{18, 4, 4, 0, 0, 255, 255}},
- {[]byte{0x11, 0x12}, 1, []byte{18, 20, 2, 4, 0, 0, 255, 255}},
- {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 1, []byte{18, 132, 2, 64, 0, 0, 0, 255, 255}},
+ {[]byte{}, 2, []byte{1, 0, 0, 255, 255}},
+ {[]byte{0x11}, 2, []byte{18, 4, 4, 0, 0, 255, 255}},
+ {[]byte{0x11, 0x12}, 2, []byte{18, 20, 2, 4, 0, 0, 255, 255}},
+ {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 2, []byte{18, 132, 2, 64, 0, 0, 0, 255, 255}},
{[]byte{}, 9, []byte{1, 0, 0, 255, 255}},
{[]byte{0x11}, 9, []byte{18, 4, 4, 0, 0, 255, 255}},
{[]byte{0x11, 0x12}, 9, []byte{18, 20, 2, 4, 0, 0, 255, 255}},
}
t.Logf("got %d bytes", len(out1))
}
+
+// TestBestSpeed tests that round-tripping through deflate and then inflate
+// recovers the original input. The Write sizes are near the thresholds in the
+// compressor.encSpeed method (0, 16, 128), as well as near maxStoreBlockSize
+// (65535).
+func TestBestSpeed(t *testing.T) {
+ abc := make([]byte, 128)
+ for i := range abc {
+ abc[i] = byte(i)
+ }
+ abcabc := bytes.Repeat(abc, 131072/len(abc))
+ var want []byte
+
+ testCases := [][]int{
+ {65536, 0},
+ {65536, 1},
+ {65536, 1, 256},
+ {65536, 1, 65536},
+ {65536, 14},
+ {65536, 15},
+ {65536, 16},
+ {65536, 16, 256},
+ {65536, 16, 65536},
+ {65536, 127},
+ {65536, 128},
+ {65536, 128, 256},
+ {65536, 128, 65536},
+ {65536, 129},
+ {65536, 65536, 256},
+ {65536, 65536, 65536},
+ }
+
+ for i, tc := range testCases {
+ for _, firstN := range []int{1, 65534, 65535, 65536, 65537, 131072} {
+ tc[0] = firstN
+ outer:
+ for _, flush := range []bool{false, true} {
+ buf := new(bytes.Buffer)
+ want = want[:0]
+
+ w, err := NewWriter(buf, BestSpeed)
+ if err != nil {
+ t.Errorf("i=%d, firstN=%d, flush=%t: NewWriter: %v", i, firstN, flush, err)
+ continue
+ }
+ for _, n := range tc {
+ want = append(want, abcabc[:n]...)
+ if _, err := w.Write(abcabc[:n]); err != nil {
+ t.Errorf("i=%d, firstN=%d, flush=%t: Write: %v", i, firstN, flush, err)
+ continue outer
+ }
+ if !flush {
+ continue
+ }
+ if err := w.Flush(); err != nil {
+ t.Errorf("i=%d, firstN=%d, flush=%t: Flush: %v", i, firstN, flush, err)
+ continue outer
+ }
+ }
+ if err := w.Close(); err != nil {
+ t.Errorf("i=%d, firstN=%d, flush=%t: Close: %v", i, firstN, flush, err)
+ continue
+ }
+
+ r := NewReader(buf)
+ got, err := ioutil.ReadAll(r)
+ if err != nil {
+ t.Errorf("i=%d, firstN=%d, flush=%t: ReadAll: %v", i, firstN, flush, err)
+ continue
+ }
+ r.Close()
+
+ if !bytes.Equal(got, want) {
+ t.Errorf("i=%d, firstN=%d, flush=%t: corruption during deflate-then-inflate", i, firstN, flush)
+ continue
+ }
+ }
+ }
+ }
+}
--- /dev/null
+++ b/src/compress/flate/deflatefast.go
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// This encoding algorithm, which prioritizes speed over output size, is
+// based on Snappy's LZ77-style encoder: github.com/golang/snappy
+
+const maxOffset = 1 << logMaxOffsetSize // Maximum deflate offset.
+
+func load32(b []byte, i int) uint32 {
+ b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+ return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load64(b []byte, i int) uint64 {
+ b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+ return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+ uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
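+// hash is a multiplicative hash over 4 input bytes; the multiplier is the
+// same one Snappy uses, and shift controls how many of the product's top
+// bits survive as the table index.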
+func hash(u, shift uint32) uint32 {
+ return (u * 0x1e35a7bd) >> shift
+}
+
+// These constants are defined by the Snappy implementation so that its
+// assembly implementation can fast-path some 16-bytes-at-a-time copies. They
+// aren't necessary in the pure Go implementation, as we don't use those same
+// optimizations, but using the same thresholds doesn't really hurt.
+const (
+ inputMargin = 16 - 1
+ minNonLiteralBlockSize = 1 + 1 + inputMargin
+)
+
+func encodeBestSpeed(dst []token, src []byte) []token {
+ // This check isn't in the Snappy implementation, but there, the caller
+ // instead of the callee handles this case.
+ if len(src) < minNonLiteralBlockSize {
+ return emitLiteral(dst, src)
+ }
+
+ // Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
+ // The table element type is uint16, as s < sLimit and sLimit < len(src)
+ // and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535.
+ const (
+ maxTableSize = 1 << 14
+ // tableMask is redundant, but helps the compiler eliminate bounds
+ // checks.
+ tableMask = maxTableSize - 1
+ )
+ shift := uint32(32 - 8)
+ for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+ shift--
+ }
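+	// For example, a full 65535-byte block grows tableSize to 1<<14 and
+	// leaves shift at 32-14 = 18, so hash keeps the top 14 bits of the
+	// 32-bit product.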
+ // In Go, all array elements are zero-initialized, so there is no advantage
+ // to a smaller tableSize per se. However, it matches the C++ algorithm,
+ // and in the asm versions of this code, we can get away with zeroing only
+ // the first tableSize elements.
+ var table [maxTableSize]uint16
+
+ // sLimit is when to stop looking for offset/length copies. The inputMargin
+ // lets us use a fast path for emitLiteral in the main loop, while we are
+ // looking for copies.
+ sLimit := len(src) - inputMargin
+
+ // nextEmit is where in src the next emitLiteral should start from.
+ nextEmit := 0
+
+ // The encoded form must start with a literal, as there are no previous
+ // bytes to copy, so we start looking for hash matches at s == 1.
+ s := 1
+ nextHash := hash(load32(src, s), shift)
+
+ for {
+ // Copied from the C++ snappy implementation:
+ //
+ // Heuristic match skipping: If 32 bytes are scanned with no matches
+ // found, start looking only at every other byte. If 32 more bytes are
+ // scanned (or skipped), look at every third byte, etc.. When a match
+ // is found, immediately go back to looking at every byte. This is a
+ // small loss (~5% performance, ~0.1% density) for compressible data
+ // due to more bookkeeping, but for non-compressible data (such as
+ // JPEG) it's a huge win since the compressor quickly "realizes" the
+ // data is incompressible and doesn't bother looking for matches
+ // everywhere.
+ //
+ // The "skip" variable keeps track of how many bytes there are since
+ // the last match; dividing it by 32 (ie. right-shifting by five) gives
+ // the number of bytes to move ahead for each iteration.
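+		// Concretely: skip starts at 32, so the stride (skip>>5) is 1 for
+		// the first 32 probes, then 2, then 3, and so on as skip grows.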
+ skip := 32
+
+ nextS := s
+ candidate := 0
+ for {
+ s = nextS
+ bytesBetweenHashLookups := skip >> 5
+ nextS = s + bytesBetweenHashLookups
+ skip += bytesBetweenHashLookups
+ if nextS > sLimit {
+ goto emitRemainder
+ }
+ candidate = int(table[nextHash&tableMask])
+ table[nextHash&tableMask] = uint16(s)
+ nextHash = hash(load32(src, nextS), shift)
+ if s-candidate < maxOffset && load32(src, s) == load32(src, candidate) {
+ break
+ }
+ }
+
+ // A 4-byte match has been found. We'll later see if more than 4 bytes
+ // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+ // them as literal bytes.
+ dst = emitLiteral(dst, src[nextEmit:s])
+
+ // Call emitCopy, and then see if another emitCopy could be our next
+ // move. Repeat until we find no match for the input immediately after
+ // what was consumed by the last emitCopy call.
+ //
+ // If we exit this loop normally then we need to call emitLiteral next,
+ // though we don't yet know how big the literal will be. We handle that
+ // by proceeding to the next iteration of the main loop. We also can
+ // exit this loop via goto if we get close to exhausting the input.
+ for {
+ // Invariant: we have a 4-byte match at s, and no need to emit any
+ // literal bytes prior to s.
+ base := s
+
+ // Extend the 4-byte match as long as possible.
+ //
+ // This is an inlined version of Snappy's:
+ // s = extendMatch(src, candidate+4, s+4)
+ s += 4
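+		// Unlike Snappy's extendMatch, which runs to the end of src, flate
+		// caps a single match at maxMatchLength (258) bytes, hence s1.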
+ s1 := base + maxMatchLength
+ if s1 > len(src) {
+ s1 = len(src)
+ }
+ for i := candidate + 4; s < s1 && src[i] == src[s]; i, s = i+1, s+1 {
+ }
+
+ // matchToken is flate's equivalent of Snappy's emitCopy.
+ dst = append(dst, matchToken(uint32(s-base-3), uint32(base-candidate-minOffsetSize)))
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ // We could immediately start working at s now, but to improve
+ // compression we first update the hash table at s-1 and at s. If
+ // another emitCopy is not our next move, also calculate nextHash
+ // at s+1. At least on GOARCH=amd64, these three hash calculations
+ // are faster as one load64 call (with some shifts) instead of
+ // three load32 calls.
+ x := load64(src, s-1)
+ prevHash := hash(uint32(x>>0), shift)
+ table[prevHash&tableMask] = uint16(s - 1)
+ currHash := hash(uint32(x>>8), shift)
+ candidate = int(table[currHash&tableMask])
+ table[currHash&tableMask] = uint16(s)
+ if s-candidate >= maxOffset || uint32(x>>8) != load32(src, candidate) {
+ nextHash = hash(uint32(x>>16), shift)
+ s++
+ break
+ }
+ }
+ }
+
+emitRemainder:
+ if nextEmit < len(src) {
+ dst = emitLiteral(dst, src[nextEmit:])
+ }
+ return dst
+}
+
+func emitLiteral(dst []token, lit []byte) []token {
+ for _, v := range lit {
+ dst = append(dst, token(v))
+ }
+ return dst
+}