compress/bzip2: support concatenated files

author Russ Cox <rsc@golang.org>

Mon, 5 Aug 2013 20:08:08 +0000 (16:08 -0400)

committer Russ Cox <rsc@golang.org>

Mon, 5 Aug 2013 20:08:08 +0000 (16:08 -0400)
author Russ Cox <rsc@golang.org>
Mon, 5 Aug 2013 20:08:08 +0000 (16:08 -0400)
committer Russ Cox <rsc@golang.org>
Mon, 5 Aug 2013 20:08:08 +0000 (16:08 -0400)
diff --git a/src/pkg/compress/bzip2/bzip2.go b/src/pkg/compress/bzip2/bzip2.go

index beeb8e169166c9988f49c048f7333aed4a186350..82e30c7c9d70d484265d7a69cc0e606cb8a4103d 100644 (file)
--- a/src/pkg/compress/bzip2/bzip2.go
+++ b/src/pkg/compress/bzip2/bzip2.go
@@ -22,14 +22,17 @@ func (s StructuralError) Error() string {
  
  // A reader decompresses bzip2 compressed data.
  type reader struct {
-       br        bitReader
-       setupDone bool // true if we have parsed the bzip2 header.
-       blockSize int  // blockSize in bytes, i.e. 900 * 1024.
-       eof       bool
-       buf       []byte    // stores Burrows-Wheeler transformed data.
-       c         [256]uint // the `C' array for the inverse BWT.
-       tt        []uint32  // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits.
-       tPos      uint32    // Index of the next output byte in tt.
+       br           bitReader
+       fileCRC      uint32
+       blockCRC     uint32
+       wantBlockCRC uint32
+       setupDone    bool // true if we have parsed the bzip2 header.
+       blockSize    int  // blockSize in bytes, i.e. 900 * 1024.
+       eof          bool
+       buf          []byte    // stores Burrows-Wheeler transformed data.
+       c            [256]uint // the `C' array for the inverse BWT.
+       tt           []uint32  // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits.
+       tPos         uint32    // Index of the next output byte in tt.
  
         preRLE      []uint32 // contains the RLE data still to be processed.
         preRLEUsed  int      // number of entries of preRLE used.
@@ -50,12 +53,14 @@ const bzip2BlockMagic = 0x314159265359
  const bzip2FinalMagic = 0x177245385090
  
  // setup parses the bzip2 header.
-func (bz2 *reader) setup() error {
+func (bz2 *reader) setup(needMagic bool) error {
         br := &bz2.br
  
-       magic := br.ReadBits(16)
-       if magic != bzip2FileMagic {
-               return StructuralError("bad magic value")
+       if needMagic {
+               magic := br.ReadBits(16)
+               if magic != bzip2FileMagic {
+                       return StructuralError("bad magic value")
+               }
         }
  
         t := br.ReadBits(8)
@@ -68,8 +73,11 @@ func (bz2 *reader) setup() error {
                 return StructuralError("invalid compression level")
         }
  
+       bz2.fileCRC = 0
         bz2.blockSize = 100 * 1024 * (int(level) - '0')
-       bz2.tt = make([]uint32, bz2.blockSize)
+       if bz2.blockSize > len(bz2.tt) {
+               bz2.tt = make([]uint32, bz2.blockSize)
+       }
         return nil
  }
  
@@ -79,7 +87,7 @@ func (bz2 *reader) Read(buf []byte) (n int, err error) {
         }
  
         if !bz2.setupDone {
-               err = bz2.setup()
+               err = bz2.setup(true)
                 brErr := bz2.br.Err()
                 if brErr != nil {
                         err = brErr
@@ -98,14 +106,14 @@ func (bz2 *reader) Read(buf []byte) (n int, err error) {
         return
  }
  
-func (bz2 *reader) read(buf []byte) (n int, err error) {
+func (bz2 *reader) readFromBlock(buf []byte) int {
         // bzip2 is a block based compressor, except that it has a run-length
         // preprocessing step. The block based nature means that we can
         // preallocate fixed-size buffers and reuse them. However, the RLE
         // preprocessing would require allocating huge buffers to store the
         // maximum expansion. Thus we process blocks all at once, except for
         // the RLE which we decompress as required.
-
+       n := 0
         for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
                 // We have RLE data pending.
  
@@ -148,34 +156,87 @@ func (bz2 *reader) read(buf []byte) (n int, err error) {
                 n++
         }
  
-       if n > 0 {
-               return
-       }
+       return n
+}
  
-       // No RLE data is pending so we need to read a block.
+func (bz2 *reader) read(buf []byte) (int, error) {
+       for {
+               n := bz2.readFromBlock(buf)
+               if n > 0 {
+                       bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
+                       return n, nil
+               }
  
-       br := &bz2.br
-       magic := br.ReadBits64(48)
-       if magic == bzip2FinalMagic {
-               br.ReadBits64(32) // ignored CRC
-               bz2.eof = true
-               return 0, io.EOF
-       } else if magic != bzip2BlockMagic {
-               return 0, StructuralError("bad magic value found")
-       }
+               // End of block. Check CRC.
+               if bz2.blockCRC != bz2.wantBlockCRC {
+                       bz2.br.err = StructuralError("block checksum mismatch")
+                       return 0, bz2.br.err
+               }
  
-       err = bz2.readBlock()
-       if err != nil {
-               return 0, err
-       }
+               // Find next block.
+               br := &bz2.br
+               switch br.ReadBits64(48) {
+               default:
+                       return 0, StructuralError("bad magic value found")
+
+               case bzip2BlockMagic:
+                       // Start of block.
+                       err := bz2.readBlock()
+                       if err != nil {
+                               return 0, err
+                       }
  
-       return bz2.read(buf)
+               case bzip2FinalMagic:
+                       // Check end-of-file CRC.
+                       wantFileCRC := uint32(br.ReadBits64(32))
+                       if br.err != nil {
+                               return 0, br.err
+                       }
+                       if bz2.fileCRC != wantFileCRC {
+                               br.err = StructuralError("file checksum mismatch")
+                               return 0, br.err
+                       }
+
+                       // Skip ahead to byte boundary.
+                       // Is there a file concatenated to this one?
+                       // It would start with BZ.
+                       if br.bits%8 != 0 {
+                               br.ReadBits(br.bits % 8)
+                       }
+                       b, err := br.r.ReadByte()
+                       if err == io.EOF {
+                               br.err = io.EOF
+                               bz2.eof = true
+                               return 0, io.EOF
+                       }
+                       if err != nil {
+                               br.err = err
+                               return 0, err
+                       }
+                       z, err := br.r.ReadByte()
+                       if err != nil {
+                               if err == io.EOF {
+                                       err = io.ErrUnexpectedEOF
+                               }
+                               br.err = err
+                               return 0, err
+                       }
+                       if b != 'B' || z != 'Z' {
+                               return 0, StructuralError("bad magic value in continuation file")
+                       }
+                       if err := bz2.setup(false); err != nil {
+                               return 0, err
+                       }
+               }
+       }
  }
  
  // readBlock reads a bzip2 block. The magic number should already have been consumed.
  func (bz2 *reader) readBlock() (err error) {
         br := &bz2.br
-       br.ReadBits64(32) // skip checksum. TODO: check it if we can figure out what it is.
+       bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
+       bz2.blockCRC = 0
+       bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC
         randomized := br.ReadBits(1)
         if randomized != 0 {
                 return StructuralError("deprecated randomized files")
@@ -391,3 +452,33 @@ func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
  
         return tt[origPtr] >> 8
  }
+
+// This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,
+// causing the bits in the input to be processed in the reverse of the usual order.
+
+var crctab [256]uint32
+
+func init() {
+       const poly = 0x04C11DB7
+       for i := range crctab {
+               crc := uint32(i) << 24
+               for j := 0; j < 8; j++ {
+                       if crc&0x80000000 != 0 {
+                               crc = (crc << 1) ^ poly
+                       } else {
+                               crc <<= 1
+                       }
+               }
+               crctab[i] = crc
+       }
+}
+
+// updateCRC updates the crc value to incorporate the data in b.
+// The initial value is 0.
+func updateCRC(val uint32, b []byte) uint32 {
+       crc := ^val
+       for _, v := range b {
+               crc = crctab[byte(crc>>24)^v] ^ (crc << 8)
+       }
+       return ^crc
+}
diff --git a/src/pkg/compress/bzip2/bzip2_test.go b/src/pkg/compress/bzip2/bzip2_test.go

index 530719160c7c26944833e23e6c1ee47c2a24d401..ada1f9a001636ff977061007ae452350db047496 100644 (file)
--- a/src/pkg/compress/bzip2/bzip2_test.go
+++ b/src/pkg/compress/bzip2/bzip2_test.go
@@ -63,6 +63,19 @@ func TestHelloWorldBZ2(t *testing.T) {
         }
  }
  
+func TestConcat(t *testing.T) {
+       out, err := decompressHex(helloWorldBZ2Hex + helloWorldBZ2Hex)
+       if err != nil {
+               t.Errorf("error from Read: %s", err)
+               return
+       }
+
+       hello2 := bytes.Repeat(helloWorld, 2)
+       if !bytes.Equal(hello2, out) {
+               t.Errorf("got %x, want %x", out, hello2)
+       }
+}
+
  func testZeros(t *testing.T, inHex string, n int) {
         out, err := decompressHex(inHex)
         if err != nil {
diff --git a/src/pkg/compress/gzip/gzip_test.go b/src/pkg/compress/gzip/gzip_test.go

index 4d1af94381cab54ff234ef9e9d73e740c7179200..39afb331dafbb9afcbddceae2a71d6d325f6d766 100644 (file)
--- a/src/pkg/compress/gzip/gzip_test.go
+++ b/src/pkg/compress/gzip/gzip_test.go
@@ -197,3 +197,20 @@ func TestWriterFlush(t *testing.T) {
                 t.Fatal("Flush didn't flush any data")
         }
  }
+
+// Multiple gzip files concatenated form a valid gzip file.
+func TestConcat(t *testing.T) {
+       var buf bytes.Buffer
+       w := NewWriter(&buf)
+       w.Write([]byte("hello "))
+       w.Close()
+       w = NewWriter(&buf)
+       w.Write([]byte("world\n"))
+       w.Close()
+
+       r, err := NewReader(&buf)
+       data, err := ioutil.ReadAll(r)
+       if string(data) != "hello world\n" || err != nil {
+               t.Fatalf("ReadAll = %q, %v, want %q, nil", data, err, "hello world")
+       }
+}
author	Russ Cox <rsc@golang.org>
	Mon, 5 Aug 2013 20:08:08 +0000 (16:08 -0400)
committer	Russ Cox <rsc@golang.org>
	Mon, 5 Aug 2013 20:08:08 +0000 (16:08 -0400)
src/pkg/compress/bzip2/bzip2.go		patch \| blob \| history
src/pkg/compress/bzip2/bzip2_test.go		patch \| blob \| history
src/pkg/compress/gzip/gzip_test.go		patch \| blob \| history