image/jpeg: speed up decoding by inlining the clip function and

author Nigel Tao <nigeltao@golang.org>

Tue, 10 May 2011 00:25:32 +0000 (17:25 -0700)

committer Nigel Tao <nigeltao@golang.org>

Tue, 10 May 2011 00:25:32 +0000 (17:25 -0700)
author Nigel Tao <nigeltao@golang.org>
Tue, 10 May 2011 00:25:32 +0000 (17:25 -0700)
committer Nigel Tao <nigeltao@golang.org>
Tue, 10 May 2011 00:25:32 +0000 (17:25 -0700)
diff --git a/src/pkg/image/jpeg/idct.go b/src/pkg/image/jpeg/idct.go

index e5a2f40f5db7bcce89e9ddc715c2e408c4ab7dbd..b387dfdffd171e4617ec21b42846d47d0eadae8b 100644 (file)
--- a/src/pkg/image/jpeg/idct.go
+++ b/src/pkg/image/jpeg/idct.go
@@ -2,6 +2,8 @@
  // Use of this source code is governed by a BSD-style
  // license that can be found in the LICENSE file.
  
+package jpeg
+
  // This is a Go translation of idct.c from
  //
  // http://standards.iso.org/ittf/PubliclyAvailableStandards/ISO_IEC_13818-4_2004_Conformance_Testing/Video/verifier/mpeg2decode_960109.tar.gz
@@ -35,8 +37,6 @@
   *
   */
  
-package jpeg
-
  const (
         w1 = 2841 // 2048*sqrt(2)*cos(1*pi/16)
         w2 = 2676 // 2048*sqrt(2)*cos(2*pi/16)
@@ -55,41 +55,45 @@ const (
         r2 = 181 // 256/sqrt(2)
  )
  
-// 2-D Inverse Discrete Cosine Transformation, followed by a +128 level shift.
+// idct performs a 2-D Inverse Discrete Cosine Transformation, followed by a
+// +128 level shift and a clip to [0, 255], writing the results to dst.
+// stride is the number of elements between successive rows of dst.
  //
-// The input coefficients should already have been multiplied by the appropriate quantization table.
-// We use fixed-point computation, with the number of bits for the fractional component varying over the
-// intermediate stages. The final values are expected to range within [0, 255], after a +128 level shift.
+// The input coefficients should already have been multiplied by the
+// appropriate quantization table. We use fixed-point computation, with the
+// number of bits for the fractional component varying over the intermediate
+// stages.
  //
-// For more on the actual algorithm, see Z. Wang, "Fast algorithms for the discrete W transform and
-// for the discrete Fourier transform", IEEE Trans. on ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
-func idct(b *block) {
+// For more on the actual algorithm, see Z. Wang, "Fast algorithms for the
+// discrete W transform and for the discrete Fourier transform", IEEE Trans. on
+// ASSP, Vol. ASSP- 32, pp. 803-816, Aug. 1984.
+func idct(dst []byte, stride int, src *block) {
         // Horizontal 1-D IDCT.
         for y := 0; y < 8; y++ {
                 // If all the AC components are zero, then the IDCT is trivial.
-               if b[y*8+1] == 0 && b[y*8+2] == 0 && b[y*8+3] == 0 &&
-                       b[y*8+4] == 0 && b[y*8+5] == 0 && b[y*8+6] == 0 && b[y*8+7] == 0 {
-                       dc := b[y*8+0] << 3
-                       b[y*8+0] = dc
-                       b[y*8+1] = dc
-                       b[y*8+2] = dc
-                       b[y*8+3] = dc
-                       b[y*8+4] = dc
-                       b[y*8+5] = dc
-                       b[y*8+6] = dc
-                       b[y*8+7] = dc
+               if src[y*8+1] == 0 && src[y*8+2] == 0 && src[y*8+3] == 0 &&
+                       src[y*8+4] == 0 && src[y*8+5] == 0 && src[y*8+6] == 0 && src[y*8+7] == 0 {
+                       dc := src[y*8+0] << 3
+                       src[y*8+0] = dc
+                       src[y*8+1] = dc
+                       src[y*8+2] = dc
+                       src[y*8+3] = dc
+                       src[y*8+4] = dc
+                       src[y*8+5] = dc
+                       src[y*8+6] = dc
+                       src[y*8+7] = dc
                         continue
                 }
  
                 // Prescale.
-               x0 := (b[y*8+0] << 11) + 128
-               x1 := b[y*8+4] << 11
-               x2 := b[y*8+6]
-               x3 := b[y*8+2]
-               x4 := b[y*8+1]
-               x5 := b[y*8+7]
-               x6 := b[y*8+5]
-               x7 := b[y*8+3]
+               x0 := (src[y*8+0] << 11) + 128
+               x1 := src[y*8+4] << 11
+               x2 := src[y*8+6]
+               x3 := src[y*8+2]
+               x4 := src[y*8+1]
+               x5 := src[y*8+7]
+               x6 := src[y*8+5]
+               x7 := src[y*8+3]
  
                 // Stage 1.
                 x8 := w7 * (x4 + x5)
@@ -119,14 +123,14 @@ func idct(b *block) {
                 x4 = (r2*(x4-x5) + 128) >> 8
  
                 // Stage 4.
-               b[8*y+0] = (x7 + x1) >> 8
-               b[8*y+1] = (x3 + x2) >> 8
-               b[8*y+2] = (x0 + x4) >> 8
-               b[8*y+3] = (x8 + x6) >> 8
-               b[8*y+4] = (x8 - x6) >> 8
-               b[8*y+5] = (x0 - x4) >> 8
-               b[8*y+6] = (x3 - x2) >> 8
-               b[8*y+7] = (x7 - x1) >> 8
+               src[8*y+0] = (x7 + x1) >> 8
+               src[8*y+1] = (x3 + x2) >> 8
+               src[8*y+2] = (x0 + x4) >> 8
+               src[8*y+3] = (x8 + x6) >> 8
+               src[8*y+4] = (x8 - x6) >> 8
+               src[8*y+5] = (x0 - x4) >> 8
+               src[8*y+6] = (x3 - x2) >> 8
+               src[8*y+7] = (x7 - x1) >> 8
         }
  
         // Vertical 1-D IDCT.
@@ -136,14 +140,14 @@ func idct(b *block) {
                 // we do not bother to check for the all-zero case.
  
                 // Prescale.
-               y0 := (b[8*0+x] << 8) + 8192
-               y1 := b[8*4+x] << 8
-               y2 := b[8*6+x]
-               y3 := b[8*2+x]
-               y4 := b[8*1+x]
-               y5 := b[8*7+x]
-               y6 := b[8*5+x]
-               y7 := b[8*3+x]
+               y0 := (src[8*0+x] << 8) + 8192
+               y1 := src[8*4+x] << 8
+               y2 := src[8*6+x]
+               y3 := src[8*2+x]
+               y4 := src[8*1+x]
+               y5 := src[8*7+x]
+               y6 := src[8*5+x]
+               y7 := src[8*3+x]
  
                 // Stage 1.
                 y8 := w7*(y4+y5) + 4
@@ -173,18 +177,28 @@ func idct(b *block) {
                 y4 = (r2*(y4-y5) + 128) >> 8
  
                 // Stage 4.
-               b[8*0+x] = (y7 + y1) >> 14
-               b[8*1+x] = (y3 + y2) >> 14
-               b[8*2+x] = (y0 + y4) >> 14
-               b[8*3+x] = (y8 + y6) >> 14
-               b[8*4+x] = (y8 - y6) >> 14
-               b[8*5+x] = (y0 - y4) >> 14
-               b[8*6+x] = (y3 - y2) >> 14
-               b[8*7+x] = (y7 - y1) >> 14
+               src[8*0+x] = (y7 + y1) >> 14
+               src[8*1+x] = (y3 + y2) >> 14
+               src[8*2+x] = (y0 + y4) >> 14
+               src[8*3+x] = (y8 + y6) >> 14
+               src[8*4+x] = (y8 - y6) >> 14
+               src[8*5+x] = (y0 - y4) >> 14
+               src[8*6+x] = (y3 - y2) >> 14
+               src[8*7+x] = (y7 - y1) >> 14
         }
  
-       // Level shift.
-       for i := range *b {
-               b[i] += 128
+       // Level shift by +128, clip to [0, 255], and write to dst.
+       for y := 0; y < 8; y++ {
+               for x := 0; x < 8; x++ {
+                       c := src[y*8+x]
+                       if c < -128 {
+                               c = 0
+                       } else if c > 127 {
+                               c = 255
+                       } else {
+                               c += 128
+                       }
+                       dst[y*stride+x] = uint8(c)
+               }
         }
  }
diff --git a/src/pkg/image/jpeg/reader.go b/src/pkg/image/jpeg/reader.go

index 21a6fff969824163fce9be1b68a98f61daef40e6..74df9ac4b76400ae17021ce0f98c7048d2959a98 100644 (file)
--- a/src/pkg/image/jpeg/reader.go
+++ b/src/pkg/image/jpeg/reader.go
@@ -96,7 +96,6 @@ type decoder struct {
         huff          [maxTc + 1][maxTh + 1]huffman
         quant         [maxTq + 1]block
         b             bits
-       blocks        [nComponent][maxH * maxV]block
         tmp           [1024]byte
  }
  
@@ -182,45 +181,6 @@ func (d *decoder) processDQT(n int) os.Error {
         return nil
  }
  
-// Clip x to the range [0, 255] inclusive.
-func clip(x int) uint8 {
-       if x < 0 {
-               return 0
-       }
-       if x > 255 {
-               return 255
-       }
-       return uint8(x)
-}
-
-// Store the MCU to the image.
-func (d *decoder) storeMCU(mx, my int) {
-       h0, v0 := d.comps[0].h, d.comps[0].v
-       // Store the luma blocks.
-       for v := 0; v < v0; v++ {
-               for h := 0; h < h0; h++ {
-                       p := 8 * ((v0*my+v)*d.img.YStride + (h0*mx + h))
-                       for y := 0; y < 8; y++ {
-                               for x := 0; x < 8; x++ {
-                                       d.img.Y[p] = clip(d.blocks[0][h0*v+h][8*y+x])
-                                       p++
-                               }
-                               p += d.img.YStride - 8
-                       }
-               }
-       }
-       // Store the chroma blocks.
-       p := 8 * (my*d.img.CStride + mx)
-       for y := 0; y < 8; y++ {
-               for x := 0; x < 8; x++ {
-                       d.img.Cb[p] = clip(d.blocks[1][0][8*y+x])
-                       d.img.Cr[p] = clip(d.blocks[2][0][8*y+x])
-                       p++
-               }
-               p += d.img.CStride - 8
-       }
-}
-
  // Specified in section B.2.3.
  func (d *decoder) processSOS(n int) os.Error {
         if n != 4+2*nComponent {
@@ -275,14 +235,18 @@ func (d *decoder) processSOS(n int) os.Error {
         }
  
         mcu, expectedRST := 0, uint8(rst0Marker)
-       var allZeroes block
-       var dc [nComponent]int
+       var (
+               allZeroes, b block
+               dc           [nComponent]int
+       )
         for my := 0; my < myy; my++ {
                 for mx := 0; mx < mxx; mx++ {
                         for i := 0; i < nComponent; i++ {
                                 qt := &d.quant[d.comps[i].tq]
                                 for j := 0; j < d.comps[i].h*d.comps[i].v; j++ {
-                                       d.blocks[i][j] = allZeroes
+                                       // TODO(nigeltao): make this a "var b block" once the compiler's escape
+                                       // analysis is good enough to allocate it on the stack, not the heap.
+                                       b = allZeroes
  
                                         // Decode the DC coefficient, as specified in section F.2.2.1.
                                         value, err := d.decodeHuffman(&d.huff[dcTableClass][scanComps[i].td])
@@ -297,7 +261,7 @@ func (d *decoder) processSOS(n int) os.Error {
                                                 return err
                                         }
                                         dc[i] += dcDelta
-                                       d.blocks[i][j][0] = dc[i] * qt[0]
+                                       b[0] = dc[i] * qt[0]
  
                                         // Decode the AC coefficients, as specified in section F.2.2.2.
                                         for k := 1; k < blockSize; k++ {
@@ -316,7 +280,7 @@ func (d *decoder) processSOS(n int) os.Error {
                                                         if err != nil {
                                                                 return err
                                                         }
-                                                       d.blocks[i][j][unzig[k]] = ac * qt[k]
+                                                       b[unzig[k]] = ac * qt[k]
                                                 } else {
                                                         if val0 != 0x0f {
                                                                 break
@@ -325,10 +289,19 @@ func (d *decoder) processSOS(n int) os.Error {
                                                 }
                                         }
  
-                                       idct(&d.blocks[i][j])
+                                       // Perform the inverse DCT and store the MCU component to the image.
+                                       switch i {
+                                       case 0:
+                                               mx0 := h0*mx + (j % 2)
+                                               my0 := v0*my + (j / 2)
+                                               idct(d.img.Y[8*(my0*d.img.YStride+mx0):], d.img.YStride, &b)
+                                       case 1:
+                                               idct(d.img.Cb[8*(my*d.img.CStride+mx):], d.img.CStride, &b)
+                                       case 2:
+                                               idct(d.img.Cr[8*(my*d.img.CStride+mx):], d.img.CStride, &b)
+                                       }
                                 } // for j
                         } // for i
-                       d.storeMCU(mx, my)
                         mcu++
                         if d.ri > 0 && mcu%d.ri == 0 && mcu < mxx*myy {
                                 // A more sophisticated decoder could use RST[0-7] markers to resynchronize from corrupt input,
author	Nigel Tao <nigeltao@golang.org>
	Tue, 10 May 2011 00:25:32 +0000 (17:25 -0700)
committer	Nigel Tao <nigeltao@golang.org>
	Tue, 10 May 2011 00:25:32 +0000 (17:25 -0700)
src/pkg/image/jpeg/idct.go		patch \| blob \| history
src/pkg/image/jpeg/reader.go		patch \| blob \| history