This encoding algorithm, which prioritizes speed over output size, is
based on Snappy's LZ77-style encoder: github.com/golang/snappy
This commit keeps the diff between this package's encodeBestSpeed
function and Snappy's encodeBlock function as small as possible (see
the diff below). Follow-up commits will improve this package's
performance and output size.
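
For context, this code path is reached through the standard flate API by
asking for BestSpeed (level 1). A minimal round-trip sketch, for
illustration only (not part of this commit):

    package main

    import (
        "bytes"
        "compress/flate"
        "io/ioutil"
        "log"
    )

    func main() {
        var buf bytes.Buffer
        // BestSpeed now selects the encodeBestSpeed path.
        w, err := flate.NewWriter(&buf, flate.BestSpeed)
        if err != nil {
            log.Fatal(err)
        }
        if _, err := w.Write([]byte("hello, hello, hello, world")); err != nil {
            log.Fatal(err)
        }
        if err := w.Close(); err != nil {
            log.Fatal(err)
        }

        r := flate.NewReader(&buf)
        got, err := ioutil.ReadAll(r)
        if err != nil {
            log.Fatal(err)
        }
        r.Close()
        log.Printf("round-tripped %d bytes", len(got))
    }
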
This package's speed benchmarks:
name old speed new speed delta
EncodeDigitsSpeed1e4-8 40.7MB/s ± 0% 73.0MB/s ± 0% +79.18% (p=0.008 n=5+5)
EncodeDigitsSpeed1e5-8 33.0MB/s ± 0% 77.3MB/s ± 1% +134.04% (p=0.008 n=5+5)
EncodeDigitsSpeed1e6-8 32.1MB/s ± 0% 82.1MB/s ± 0% +156.18% (p=0.008 n=5+5)
EncodeTwainSpeed1e4-8 42.1MB/s ± 0% 65.0MB/s ± 0% +54.61% (p=0.008 n=5+5)
EncodeTwainSpeed1e5-8 46.3MB/s ± 0% 80.0MB/s ± 0% +72.81% (p=0.008 n=5+5)
EncodeTwainSpeed1e6-8 47.3MB/s ± 0% 81.7MB/s ± 0% +72.86% (p=0.008 n=5+5)
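
The table above is benchstat output. A comparison like this can be
reproduced along these lines, assuming the golang.org/x/perf/cmd/benchstat
tool and the benchmark names above:

    go test -bench=Speed -count=5 > old.txt
    # apply this commit
    go test -bench=Speed -count=5 > new.txt
    benchstat old.txt new.txt
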
Here are the milliseconds taken, before and after this commit, to compress
a number of test files:
Go's src/compress/testdata files:
4 1 e.txt
8 4 Mark.Twain-Tom.Sawyer.txt
github.com/golang/snappy's benchmark files:
3 1 alice29.txt
12 3 asyoulik.txt
6 1 fireworks.jpeg
1 1 geo.protodata
1 0 html
2 2 html_x_4
6 3 kppkn.gtb
11 4 lcet10.txt
5 1 paper-100k.pdf
14 6 plrabn12.txt
17 6 urls.10K
Larger files linked to from
https://docs.google.com/spreadsheets/d/1VLxi-ac0BAtf735HyH3c1xRulbkYYUkFecKdLPH7NIQ/edit#gid=166102500
2409 3182 adresser.001
16757 11027 enwik9
13764 12946 gob-stream
153978 74317 rawstudio-mint14.tar
4371 770 sharnd.out
Output size is larger. In the table below, the first column is the input
size, the second column is the output size prior to this commit, and the
third column is the output size after this commit.
100003 47707 50006 e.txt
387851 172707 182930 Mark.Twain-Tom.Sawyer.txt
152089 62457 66705 alice29.txt
125179 54503 57274 asyoulik.txt
123093 122827 123108 fireworks.jpeg
118588 18574 20558 geo.protodata
102400 16601 17305 html
409600 65506 70313 html_x_4
184320 49007 50944 kppkn.gtb
426754 166957 179355 lcet10.txt
102400 82126 84937 paper-100k.pdf
481861 218617 231988 plrabn12.txt
702087 241774 258020 urls.10K
1073741824 43074110 57269781 adresser.001
1000000000 365772256 391052000 enwik9
1911399616 340364558 378679516 gob-stream
8558382592 3807229562 3972329193 rawstudio-mint14.tar
200000000 200061040 200015265 sharnd.out
The diff between github.com/golang/snappy's encodeBlock function and
this commit's encodeBestSpeed function:
1c1,7
< func encodeBlock(dst, src []byte) (d int) {
---
> func encodeBestSpeed(dst []token, src []byte) []token {
> // This check isn't in the Snappy implementation, but there, the caller
> // instead of the callee handles this case.
> if len(src) < minNonLiteralBlockSize {
> return emitLiteral(dst, src)
> }
>
4c10
< // and len(src) <= maxBlockSize and maxBlockSize == 65536.
---
> // and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535.
65c71
< if load32(src, s) == load32(src, candidate) {
---
> if s-candidate < maxOffset && load32(src, s) == load32(src, candidate) {
73c79
< d += emitLiteral(dst[d:], src[nextEmit:s])
---
> dst = emitLiteral(dst, src[nextEmit:s])
90c96
< // This is an inlined version of:
---
> // This is an inlined version of Snappy's:
93c99,103
< for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 {
---
> s1 := base + maxMatchLength
> if s1 > len(src) {
> s1 = len(src)
> }
> for i := candidate + 4; s < s1 && src[i] == src[s]; i, s = i+1, s+1 {
96c106,107
< d += emitCopy(dst[d:], base-candidate, s-base)
---
> // matchToken is flate's equivalent of Snappy's emitCopy.
> dst = append(dst, matchToken(uint32(s-base-3), uint32(base-candidate-minOffsetSize)))
114c125
< if uint32(x>>8) != load32(src, candidate) {
---
> if s-candidate >= maxOffset || uint32(x>>8) != load32(src, candidate) {
124c135
< d += emitLiteral(dst[d:], src[nextEmit:])
---
> dst = emitLiteral(dst, src[nextEmit:])
126c137
< return d
---
> return dst
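
For reference on the matchToken call above: flate match lengths start at 3
and offsets at 1 (minOffsetSize), so a match token stores length-3 and
offset-1. A sketch of the packing, mirroring compress/flate's token.go
(shown for illustration; this commit does not change token.go):

    // A match of length 7 at offset 100 is encoded as matchToken(4, 99).
    func matchToken(xlength uint32, xoffset uint32) token {
        return token(matchType + xlength<<lengthShift + xoffset)
    }
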
This change is based on https://go-review.googlesource.com/#/c/21021/ by
Klaus Post, but it is a separate changelist as cl/21021 seems to have
stalled in code review, and the Go 1.7 feature freeze approaches.
Golang-dev discussion:
https://groups.google.com/d/topic/golang-dev/XYgHX9p8IOk/discussion and
of course cl/21021.
Change-Id: Ib662439417b3bd0b61c2977c12c658db3e44d164
Reviewed-on: https://go-review.googlesource.com/22370
Reviewed-by: Russ Cox <rsc@golang.org>
}
var levels = []compressionLevel{
- {}, // 0
- // For levels 1-3 we don't bother trying with lazy matches
- {1, 4, 0, 8, 4, 4},
+ {0, 0, 0, 0, 0, 0}, // NoCompression.
+ {1, 0, 0, 0, 0, 0}, // BestSpeed uses a custom algorithm; see deflatefast.go.
+ // For levels 2-3 we don't bother trying with lazy matches.
{2, 4, 0, 16, 8, 5},
{3, 4, 0, 32, 32, 6},
// Levels 4-9 use increasingly more lazy matching
// Should only be used after a reset.
func (d *compressor) fillWindow(b []byte) {
// Do not fill window if we are in store-only mode.
- if d.compressionLevel.level == 0 {
+ if d.compressionLevel.level < 2 {
return
}
if d.index != 0 || d.windowEnd != 0 {
return max
}
+// encSpeed will compress and store the currently added data,
+// if enough has been accumulated or we are at the end of the stream.
+// Any error that occurred will be in d.err.
+func (d *compressor) encSpeed() {
+ // We only compress if we have maxStoreBlockSize.
+ if d.windowEnd < maxStoreBlockSize {
+ if !d.sync {
+ return
+ }
+
+ // Handle small sizes.
+ if d.windowEnd < 128 {
+ switch {
+ case d.windowEnd == 0:
+ return
+ case d.windowEnd <= 16:
+ d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+ default:
+ d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+ d.err = d.w.err
+ }
+ d.windowEnd = 0
+ return
+ }
+
+ }
+ // Encode the block.
+ d.tokens = encodeBestSpeed(d.tokens[:0], d.window[:d.windowEnd])
+
+ // If we removed less than 1/16th, Huffman compress the block.
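+	// (For a full 65535-byte window, that threshold is 65535-4095 = 61440 tokens.)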
+ if len(d.tokens) > d.windowEnd-(d.windowEnd>>4) {
+ d.w.writeBlockHuff(false, d.window[:d.windowEnd])
+ } else {
+ d.w.writeBlockDynamic(d.tokens, false, d.window[:d.windowEnd])
+ }
+ d.err = d.w.err
+ d.windowEnd = 0
+}
+
func (d *compressor) initDeflate() {
d.window = make([]byte, 2*windowSize)
d.hashOffset = 1
d.window = make([]byte, maxStoreBlockSize)
d.fill = (*compressor).fillStore
d.step = (*compressor).storeHuff
+ case level == BestSpeed:
+ d.compressionLevel = levels[level]
+ d.window = make([]byte, maxStoreBlockSize)
+ d.fill = (*compressor).fillStore
+ d.step = (*compressor).encSpeed
+ d.tokens = make([]token, maxStoreBlockSize)
case level == DefaultCompression:
level = 6
fallthrough
- case 1 <= level && level <= 9:
+ case 2 <= level && level <= 9:
d.compressionLevel = levels[level]
d.initDeflate()
d.fill = (*compressor).fillDeflate
switch d.compressionLevel.level {
case NoCompression:
d.windowEnd = 0
+ case BestSpeed:
+ d.windowEnd = 0
+ d.tokens = d.tokens[:0]
default:
d.chainHead = -1
for i := range d.hashHead {
{[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 0,
[]byte{0, 8, 0, 247, 255, 17, 17, 17, 17, 17, 17, 17, 17, 1, 0, 0, 255, 255},
},
- {[]byte{}, 1, []byte{1, 0, 0, 255, 255}},
- {[]byte{0x11}, 1, []byte{18, 4, 4, 0, 0, 255, 255}},
- {[]byte{0x11, 0x12}, 1, []byte{18, 20, 2, 4, 0, 0, 255, 255}},
- {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 1, []byte{18, 132, 2, 64, 0, 0, 0, 255, 255}},
+ {[]byte{}, 2, []byte{1, 0, 0, 255, 255}},
+ {[]byte{0x11}, 2, []byte{18, 4, 4, 0, 0, 255, 255}},
+ {[]byte{0x11, 0x12}, 2, []byte{18, 20, 2, 4, 0, 0, 255, 255}},
+ {[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11}, 2, []byte{18, 132, 2, 64, 0, 0, 0, 255, 255}},
{[]byte{}, 9, []byte{1, 0, 0, 255, 255}},
{[]byte{0x11}, 9, []byte{18, 4, 4, 0, 0, 255, 255}},
{[]byte{0x11, 0x12}, 9, []byte{18, 20, 2, 4, 0, 0, 255, 255}},
}
t.Logf("got %d bytes", len(out1))
}
+
+// TestBestSpeed tests that round-tripping through deflate and then inflate
+// recovers the original input. The Write sizes are near the thresholds in the
+// compressor.encSpeed method (0, 16, 128), as well as near maxStoreBlockSize
+// (65535).
+func TestBestSpeed(t *testing.T) {
+ abc := make([]byte, 128)
+ for i := range abc {
+ abc[i] = byte(i)
+ }
+ abcabc := bytes.Repeat(abc, 131072/len(abc))
+ var want []byte
+
+ testCases := [][]int{
+ {65536, 0},
+ {65536, 1},
+ {65536, 1, 256},
+ {65536, 1, 65536},
+ {65536, 14},
+ {65536, 15},
+ {65536, 16},
+ {65536, 16, 256},
+ {65536, 16, 65536},
+ {65536, 127},
+ {65536, 128},
+ {65536, 128, 256},
+ {65536, 128, 65536},
+ {65536, 129},
+ {65536, 65536, 256},
+ {65536, 65536, 65536},
+ }
+
+ for i, tc := range testCases {
+ for _, firstN := range []int{1, 65534, 65535, 65536, 65537, 131072} {
+ tc[0] = firstN
+ outer:
+ for _, flush := range []bool{false, true} {
+ buf := new(bytes.Buffer)
+ want = want[:0]
+
+ w, err := NewWriter(buf, BestSpeed)
+ if err != nil {
+ t.Errorf("i=%d, firstN=%d, flush=%t: NewWriter: %v", i, firstN, flush, err)
+ continue
+ }
+ for _, n := range tc {
+ want = append(want, abcabc[:n]...)
+ if _, err := w.Write(abcabc[:n]); err != nil {
+ t.Errorf("i=%d, firstN=%d, flush=%t: Write: %v", i, firstN, flush, err)
+ continue outer
+ }
+ if !flush {
+ continue
+ }
+ if err := w.Flush(); err != nil {
+ t.Errorf("i=%d, firstN=%d, flush=%t: Flush: %v", i, firstN, flush, err)
+ continue outer
+ }
+ }
+ if err := w.Close(); err != nil {
+ t.Errorf("i=%d, firstN=%d, flush=%t: Close: %v", i, firstN, flush, err)
+ continue
+ }
+
+ r := NewReader(buf)
+ got, err := ioutil.ReadAll(r)
+ if err != nil {
+ t.Errorf("i=%d, firstN=%d, flush=%t: ReadAll: %v", i, firstN, flush, err)
+ continue
+ }
+ r.Close()
+
+ if !bytes.Equal(got, want) {
+ t.Errorf("i=%d, firstN=%d, flush=%t: corruption during deflate-then-inflate", i, firstN, flush)
+ continue
+ }
+ }
+ }
+ }
+}
--- /dev/null
+++ b/src/compress/flate/deflatefast.go
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// This encoding algorithm, which prioritizes speed over output size, is
+// based on Snappy's LZ77-style encoder: github.com/golang/snappy
+
+const maxOffset = 1 << logMaxOffsetSize // Maximum deflate offset.
+
+func load32(b []byte, i int) uint32 {
+ b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+ return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load64(b []byte, i int) uint64 {
+ b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+ return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+ uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
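+// hash is a multiplicative hash over 4 input bytes; the multiplier is the
+// same one Snappy uses, and shift controls how many of the product's top
+// bits survive as the table index.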
+func hash(u, shift uint32) uint32 {
+ return (u * 0x1e35a7bd) >> shift
+}
+
+// These constants are defined by the Snappy implementation so that its
+// assembly implementation can fast-path some 16-bytes-at-a-time copies. They
+// aren't necessary in the pure Go implementation, as we don't use those same
+// optimizations, but using the same thresholds doesn't really hurt.
+const (
+ inputMargin = 16 - 1
+ minNonLiteralBlockSize = 1 + 1 + inputMargin
+)
+
+func encodeBestSpeed(dst []token, src []byte) []token {
+ // This check isn't in the Snappy implementation, but there, the caller
+ // instead of the callee handles this case.
+ if len(src) < minNonLiteralBlockSize {
+ return emitLiteral(dst, src)
+ }
+
+ // Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
+ // The table element type is uint16, as s < sLimit and sLimit < len(src)
+ // and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535.
+ const (
+ maxTableSize = 1 << 14
+ // tableMask is redundant, but helps the compiler eliminate bounds
+ // checks.
+ tableMask = maxTableSize - 1
+ )
+ shift := uint32(32 - 8)
+ for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+ shift--
+ }
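+	// For example, a full 65535-byte block grows tableSize to 1<<14 and
+	// leaves shift at 32-14 = 18, so hash keeps the top 14 bits of the
+	// 32-bit product.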
+ // In Go, all array elements are zero-initialized, so there is no advantage
+ // to a smaller tableSize per se. However, it matches the C++ algorithm,
+ // and in the asm versions of this code, we can get away with zeroing only
+ // the first tableSize elements.
+ var table [maxTableSize]uint16
+
+ // sLimit is when to stop looking for offset/length copies. The inputMargin
+ // lets us use a fast path for emitLiteral in the main loop, while we are
+ // looking for copies.
+ sLimit := len(src) - inputMargin
+
+ // nextEmit is where in src the next emitLiteral should start from.
+ nextEmit := 0
+
+ // The encoded form must start with a literal, as there are no previous
+ // bytes to copy, so we start looking for hash matches at s == 1.
+ s := 1
+ nextHash := hash(load32(src, s), shift)
+
+ for {
+ // Copied from the C++ snappy implementation:
+ //
+ // Heuristic match skipping: If 32 bytes are scanned with no matches
+ // found, start looking only at every other byte. If 32 more bytes are
+ // scanned (or skipped), look at every third byte, etc.. When a match
+ // is found, immediately go back to looking at every byte. This is a
+ // small loss (~5% performance, ~0.1% density) for compressible data
+ // due to more bookkeeping, but for non-compressible data (such as
+ // JPEG) it's a huge win since the compressor quickly "realizes" the
+ // data is incompressible and doesn't bother looking for matches
+ // everywhere.
+ //
+ // The "skip" variable keeps track of how many bytes there are since
+ // the last match; dividing it by 32 (ie. right-shifting by five) gives
+ // the number of bytes to move ahead for each iteration.
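+		// Concretely: skip starts at 32, so the stride (skip>>5) is 1 for
+		// the first 32 probes, then 2, then 3, and so on as skip grows.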
+ skip := 32
+
+ nextS := s
+ candidate := 0
+ for {
+ s = nextS
+ bytesBetweenHashLookups := skip >> 5
+ nextS = s + bytesBetweenHashLookups
+ skip += bytesBetweenHashLookups
+ if nextS > sLimit {
+ goto emitRemainder
+ }
+ candidate = int(table[nextHash&tableMask])
+ table[nextHash&tableMask] = uint16(s)
+ nextHash = hash(load32(src, nextS), shift)
+ if s-candidate < maxOffset && load32(src, s) == load32(src, candidate) {
+ break
+ }
+ }
+
+ // A 4-byte match has been found. We'll later see if more than 4 bytes
+ // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+ // them as literal bytes.
+ dst = emitLiteral(dst, src[nextEmit:s])
+
+ // Call emitCopy, and then see if another emitCopy could be our next
+ // move. Repeat until we find no match for the input immediately after
+ // what was consumed by the last emitCopy call.
+ //
+ // If we exit this loop normally then we need to call emitLiteral next,
+ // though we don't yet know how big the literal will be. We handle that
+ // by proceeding to the next iteration of the main loop. We also can
+ // exit this loop via goto if we get close to exhausting the input.
+ for {
+ // Invariant: we have a 4-byte match at s, and no need to emit any
+ // literal bytes prior to s.
+ base := s
+
+ // Extend the 4-byte match as long as possible.
+ //
+ // This is an inlined version of Snappy's:
+ // s = extendMatch(src, candidate+4, s+4)
+ s += 4
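+		// Unlike Snappy's extendMatch, which runs to the end of src, flate
+		// caps a single match at maxMatchLength (258) bytes, hence s1.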
+ s1 := base + maxMatchLength
+ if s1 > len(src) {
+ s1 = len(src)
+ }
+ for i := candidate + 4; s < s1 && src[i] == src[s]; i, s = i+1, s+1 {
+ }
+
+ // matchToken is flate's equivalent of Snappy's emitCopy.
+ dst = append(dst, matchToken(uint32(s-base-3), uint32(base-candidate-minOffsetSize)))
+ nextEmit = s
+ if s >= sLimit {
+ goto emitRemainder
+ }
+
+ // We could immediately start working at s now, but to improve
+ // compression we first update the hash table at s-1 and at s. If
+ // another emitCopy is not our next move, also calculate nextHash
+ // at s+1. At least on GOARCH=amd64, these three hash calculations
+ // are faster as one load64 call (with some shifts) instead of
+ // three load32 calls.
+ x := load64(src, s-1)
+ prevHash := hash(uint32(x>>0), shift)
+ table[prevHash&tableMask] = uint16(s - 1)
+ currHash := hash(uint32(x>>8), shift)
+ candidate = int(table[currHash&tableMask])
+ table[currHash&tableMask] = uint16(s)
+ if s-candidate >= maxOffset || uint32(x>>8) != load32(src, candidate) {
+ nextHash = hash(uint32(x>>16), shift)
+ s++
+ break
+ }
+ }
+ }
+
+emitRemainder:
+ if nextEmit < len(src) {
+ dst = emitLiteral(dst, src[nextEmit:])
+ }
+ return dst
+}
+
+func emitLiteral(dst []token, lit []byte) []token {
+ for _, v := range lit {
+ dst = append(dst, token(v))
+ }
+ return dst
+}