img3          *image.YCbCr
        ri            int // Restart Interval.
        nComp         int
+       progressive   bool
+       eobRun        uint16 // End-of-Band run, specified in section G.1.2.2.
        comp          [nColorComponent]component
+       progCoeffs    [nColorComponent][]block // Saved state between progressive-mode scans.
        huff          [maxTc + 1][maxTh + 1]huffman
        quant         [maxTq + 1]block // Quantization tables, in zig-zag order.
        tmp           [1024]byte
        d.img3 = m.SubImage(image.Rect(0, 0, d.width, d.height)).(*image.YCbCr)
 }
 
+// TODO(nigeltao): move processSOS to scan.go.
+
 // Specified in section B.2.3.
 func (d *decoder) processSOS(n int) error {
        if d.nComp == 0 {
                return FormatError("missing SOF marker")
        }
-       if n != 4+2*d.nComp {
-               return UnsupportedError("SOS has wrong length")
+       if n < 6 || 4+2*d.nComp < n || n%2 != 0 {
+               return FormatError("SOS has wrong length")
        }
-       _, err := io.ReadFull(d.r, d.tmp[0:4+2*d.nComp])
+       _, err := io.ReadFull(d.r, d.tmp[:n])
        if err != nil {
                return err
        }
-       if int(d.tmp[0]) != d.nComp {
-               return UnsupportedError("SOS has wrong number of image components")
+       nComp := int(d.tmp[0])
+       if n != 4+2*nComp {
+               return FormatError("SOS length inconsistent with number of components")
        }
        var scan [nColorComponent]struct {
-               td uint8 // DC table selector.
-               ta uint8 // AC table selector.
+               compIndex uint8
+               td        uint8 // DC table selector.
+               ta        uint8 // AC table selector.
        }
-       for i := 0; i < d.nComp; i++ {
+       for i := 0; i < nComp; i++ {
                cs := d.tmp[1+2*i] // Component selector.
-               if cs != d.comp[i].c {
-                       return UnsupportedError("scan components out of order")
+               compIndex := -1
+               for j, comp := range d.comp {
+                       if cs == comp.c {
+                               compIndex = j
+                       }
                }
+               if compIndex < 0 {
+                       return FormatError("unknown component selector")
+               }
+               scan[i].compIndex = uint8(compIndex)
                scan[i].td = d.tmp[2+2*i] >> 4
                scan[i].ta = d.tmp[2+2*i] & 0x0f
        }
+
+       // zigStart and zigEnd are the spectral selection bounds.
+       // ah and al are the successive approximation high and low values.
+       // The spec calls these values Ss, Se, Ah and Al.
+       //
+       // For progressive JPEGs, these are the two more-or-less independent
+       // aspects of progression. Spectral selection progression is when not
+       // all of a block's 64 DCT coefficients are transmitted in one pass.
+       // For example, three passes could transmit coefficient 0 (the DC
+       // component), coefficients 1-5, and coefficients 6-63, in zig-zag
+       // order. Successive approximation is when not all of the bits of a
+       // band of coefficients are transmitted in one pass. For example,
+       // three passes could transmit the 6 most significant bits, followed
+       // by the second-least significant bit, followed by the least
+       // significant bit.
+       //
+       // For baseline JPEGs, these parameters are hard-coded to 0/63/0/0.
+       zigStart, zigEnd, ah, al := 0, blockSize-1, uint(0), uint(0)
+       if d.progressive {
+               zigStart = int(d.tmp[1+2*nComp])
+               zigEnd = int(d.tmp[2+2*nComp])
+               ah = uint(d.tmp[3+2*nComp] >> 4)
+               al = uint(d.tmp[3+2*nComp] & 0x0f)
+               if (zigStart == 0 && zigEnd != 0) || zigStart > zigEnd || blockSize <= zigEnd {
+                       return FormatError("bad spectral selection bounds")
+               }
+               if zigStart != 0 && nComp != 1 {
+                       return FormatError("progressive AC coefficients for more than one component")
+               }
+               if ah != 0 && ah != al+1 {
+                       return FormatError("bad successive approximation values")
+               }
+       }
+
        // mxx and myy are the number of MCUs (Minimum Coded Units) in the image.
        h0, v0 := d.comp[0].h, d.comp[0].v // The h and v values from the Y components.
        mxx := (d.width + 8*h0 - 1) / (8 * h0)
        myy := (d.height + 8*v0 - 1) / (8 * v0)
        if d.img1 == nil && d.img3 == nil {
                d.makeImg(h0, v0, mxx, myy)
+               if d.progressive {
+                       for i := 0; i < nComp; i++ {
+                               compIndex := scan[i].compIndex
+                               d.progCoeffs[compIndex] = make([]block, mxx*myy*d.comp[compIndex].h*d.comp[compIndex].v)
+                       }
+               }
        }
 
+       d.b = bits{}
        mcu, expectedRST := 0, uint8(rst0Marker)
        var (
+               // b is the decoded coefficients, in natural (not zig-zag) order.
                b  block
                dc [nColorComponent]int
+               // mx0 and my0 are the location of the current (in terms of 8x8 blocks).
+               // For example, with 4:2:0 chroma subsampling, the block whose top left
+               // pixel co-ordinates are (16, 8) is the third block in the first row:
+               // mx0 is 2 and my0 is 0, even though the pixel is in the second MCU.
+               // TODO(nigeltao): rename mx0 and my0 to bx and by?
+               mx0, my0   int
+               blockCount int
        )
        for my := 0; my < myy; my++ {
                for mx := 0; mx < mxx; mx++ {
-                       for i := 0; i < d.nComp; i++ {
-                               qt := &d.quant[d.comp[i].tq]
-                               for j := 0; j < d.comp[i].h*d.comp[i].v; j++ {
-                                       // TODO(nigeltao): make this a "var b block" once the compiler's escape
-                                       // analysis is good enough to allocate it on the stack, not the heap.
-                                       // b is in natural (not zig-zag) order.
-                                       b = block{}
-
-                                       // Decode the DC coefficient, as specified in section F.2.2.1.
-                                       value, err := d.decodeHuffman(&d.huff[dcTable][scan[i].td])
-                                       if err != nil {
-                                               return err
-                                       }
-                                       if value > 16 {
-                                               return UnsupportedError("excessive DC component")
+                       for i := 0; i < nComp; i++ {
+                               compIndex := scan[i].compIndex
+                               qt := &d.quant[d.comp[compIndex].tq]
+                               for j := 0; j < d.comp[compIndex].h*d.comp[compIndex].v; j++ {
+                                       // The blocks are traversed one MCU at a time. For 4:2:0 chroma
+                                       // subsampling, there are four Y 8x8 blocks in every 16x16 MCU.
+                                       // For a baseline 32x16 pixel image, the Y blocks visiting order is:
+                                       //      0 1 4 5
+                                       //      2 3 6 7
+                                       //
+                                       // For progressive images, the DC data blocks (zigStart == 0) are traversed
+                                       // as above, but AC data blocks are traversed left to right, top to bottom:
+                                       //      0 1 2 3
+                                       //      4 5 6 7
+                                       //
+                                       // To further complicate matters, there is no AC data for any blocks that
+                                       // are inside the image at the MCU level but outside the image at the pixel
+                                       // level. For example, a 24x16 pixel 4:2:0 progressive image consists of
+                                       // two 16x16 MCUs. The earlier scans will process 8 Y blocks:
+                                       //      0 1 4 5
+                                       //      2 3 6 7
+                                       // The later scans will process only 6 Y blocks:
+                                       //      0 1 2
+                                       //      3 4 5
+                                       if zigStart == 0 {
+                                               mx0, my0 = d.comp[compIndex].h*mx, d.comp[compIndex].v*my
+                                               if h0 == 1 {
+                                                       my0 += j
+                                               } else {
+                                                       mx0 += j % 2
+                                                       my0 += j / 2
+                                               }
+                                       } else {
+                                               q := mxx * d.comp[compIndex].h
+                                               mx0 = blockCount % q
+                                               my0 = blockCount / q
+                                               blockCount++
+                                               if mx0*8 >= d.width || my0*8 >= d.height {
+                                                       continue
+                                               }
                                        }
-                                       dcDelta, err := d.receiveExtend(value)
-                                       if err != nil {
-                                               return err
+
+                                       // Load the previous partially decoded coefficients, if applicable.
+                                       if d.progressive {
+                                               b = d.progCoeffs[compIndex][my0*mxx*d.comp[compIndex].h+mx0]
+                                       } else {
+                                               b = block{}
                                        }
-                                       dc[i] += dcDelta
-                                       b[0] = dc[i] * qt[0]
 
-                                       // Decode the AC coefficients, as specified in section F.2.2.2.
-                                       for zig := 1; zig < blockSize; zig++ {
-                                               value, err := d.decodeHuffman(&d.huff[acTable][scan[i].ta])
-                                               if err != nil {
+                                       if ah != 0 {
+                                               if err := d.refine(&b, &d.huff[acTable][scan[i].ta], zigStart, zigEnd, 1<<al); err != nil {
                                                        return err
                                                }
-                                               val0 := value >> 4
-                                               val1 := value & 0x0f
-                                               if val1 != 0 {
-                                                       zig += int(val0)
-                                                       if zig > blockSize {
-                                                               return FormatError("bad DCT index")
+                                       } else {
+                                               zig := zigStart
+                                               if zig == 0 {
+                                                       zig++
+                                                       // Decode the DC coefficient, as specified in section F.2.2.1.
+                                                       value, err := d.decodeHuffman(&d.huff[dcTable][scan[i].td])
+                                                       if err != nil {
+                                                               return err
+                                                       }
+                                                       if value > 16 {
+                                                               return UnsupportedError("excessive DC component")
                                                        }
-                                                       ac, err := d.receiveExtend(val1)
+                                                       dcDelta, err := d.receiveExtend(value)
                                                        if err != nil {
                                                                return err
                                                        }
-                                                       b[unzig[zig]] = ac * qt[zig]
+                                                       dc[compIndex] += dcDelta
+                                                       b[0] = dc[compIndex] << al
+                                               }
+
+                                               if zig <= zigEnd && d.eobRun > 0 {
+                                                       d.eobRun--
                                                } else {
-                                                       if val0 != 0x0f {
-                                                               break
+                                                       // Decode the AC coefficients, as specified in section F.2.2.2.
+                                                       for ; zig <= zigEnd; zig++ {
+                                                               value, err := d.decodeHuffman(&d.huff[acTable][scan[i].ta])
+                                                               if err != nil {
+                                                                       return err
+                                                               }
+                                                               val0 := value >> 4
+                                                               val1 := value & 0x0f
+                                                               if val1 != 0 {
+                                                                       zig += int(val0)
+                                                                       if zig > zigEnd {
+                                                                               break
+                                                                       }
+                                                                       ac, err := d.receiveExtend(val1)
+                                                                       if err != nil {
+                                                                               return err
+                                                                       }
+                                                                       b[unzig[zig]] = ac << al
+                                                               } else {
+                                                                       if val0 != 0x0f {
+                                                                               d.eobRun = uint16(1 << val0)
+                                                                               if val0 != 0 {
+                                                                                       bits, err := d.decodeBits(int(val0))
+                                                                                       if err != nil {
+                                                                                               return err
+                                                                                       }
+                                                                                       d.eobRun |= uint16(bits)
+                                                                               }
+                                                                               d.eobRun--
+                                                                               break
+                                                                       }
+                                                                       zig += 0x0f
+                                                               }
                                                        }
-                                                       zig += 0x0f
                                                }
                                        }
 
-                                       // Perform the inverse DCT and store the MCU component to the image.
+                                       if d.progressive {
+                                               if zigEnd != blockSize-1 || al != 0 {
+                                                       // We haven't completely decoded this 8x8 block. Save the coefficients.
+                                                       d.progCoeffs[compIndex][my0*mxx*d.comp[compIndex].h+mx0] = b
+                                                       // At this point, we could execute the rest of the loop body to dequantize and
+                                                       // perform the inverse DCT, to save early stages of a progressive image to the
+                                                       // *image.YCbCr buffers (the whole point of progressive encoding), but in Go,
+                                                       // the jpeg.Decode function does not return until the entire image is decoded,
+                                                       // so we "continue" here to avoid wasted computation.
+                                                       continue
+                                               }
+                                       }
+
+                                       // Dequantize, perform the inverse DCT and store the block to the image.
+                                       for zig := 0; zig < blockSize; zig++ {
+                                               b[unzig[zig]] *= qt[zig]
+                                       }
                                        idct(&b)
                                        dst, stride := []byte(nil), 0
                                        if d.nComp == nGrayComponent {
-                                               dst, stride = d.img1.Pix[8*(my*d.img1.Stride+mx):], d.img1.Stride
+                                               dst, stride = d.img1.Pix[8*(my0*d.img1.Stride+mx0):], d.img1.Stride
                                        } else {
-                                               switch i {
+                                               switch compIndex {
                                                case 0:
-                                                       mx0, my0 := h0*mx, v0*my
-                                                       if h0 == 1 {
-                                                               my0 += j
-                                                       } else {
-                                                               mx0 += j % 2
-                                                               my0 += j / 2
-                                                       }
                                                        dst, stride = d.img3.Y[8*(my0*d.img3.YStride+mx0):], d.img3.YStride
                                                case 1:
-                                                       dst, stride = d.img3.Cb[8*(my*d.img3.CStride+mx):], d.img3.CStride
+                                                       dst, stride = d.img3.Cb[8*(my0*d.img3.CStride+mx0):], d.img3.CStride
                                                case 2:
-                                                       dst, stride = d.img3.Cr[8*(my*d.img3.CStride+mx):], d.img3.CStride
+                                                       dst, stride = d.img3.Cr[8*(my0*d.img3.CStride+mx0):], d.img3.CStride
+                                               default:
+                                                       return UnsupportedError("too many components")
                                                }
                                        }
                                        // Level shift by +128, clip to [0, 255], and write to dst.
                                d.b = bits{}
                                // Reset the DC components, as per section F.2.1.3.1.
                                dc = [nColorComponent]int{}
+                               // Reset the progressive decoder state, as per section G.1.2.2.
+                               d.eobRun = 0
                        }
                } // for mx
        } // for my
                }
 
                switch {
-               case marker == sof0Marker: // Start Of Frame (Baseline).
+               case marker == sof0Marker || marker == sof2Marker: // Start Of Frame.
+                       d.progressive = marker == sof2Marker
                        err = d.processSOF(n)
                        if configOnly {
                                return nil, err
                        }
-               case marker == sof2Marker: // Start Of Frame (Progressive).
-                       err = UnsupportedError("progressive mode")
                case marker == dhtMarker: // Define Huffman Table.
                        err = d.processDHT(n)
                case marker == dqtMarker: // Define Quantization Table.
 
--- /dev/null
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package jpeg
+
+import (
+       "bytes"
+       "fmt"
+       "image"
+       "io/ioutil"
+       "os"
+       "testing"
+)
+
+// TestDecodeProgressive tests that decoding the baseline and progressive
+// versions of the same image result in exactly the same pixel data, in YCbCr
+// space for color images, and Y space for grayscale images.
+func TestDecodeProgressive(t *testing.T) {
+       testCases := []string{
+               "../testdata/video-001",
+               "../testdata/video-001.q50.420",
+               "../testdata/video-001.q50.422",
+               "../testdata/video-001.q50.444",
+               "../testdata/video-005.gray.q50",
+       }
+       for _, tc := range testCases {
+               m0, err := decodeFile(tc + ".jpeg")
+               if err != nil {
+                       t.Errorf("%s: %v", tc+".jpeg", err)
+                       continue
+               }
+               m1, err := decodeFile(tc + ".progressive.jpeg")
+               if err != nil {
+                       t.Errorf("%s: %v", tc+".progressive.jpeg", err)
+                       continue
+               }
+               if m0.Bounds() != m1.Bounds() {
+                       t.Errorf("%s: bounds differ: %v and %v", tc, m0.Bounds(), m1.Bounds())
+                       continue
+               }
+               switch m0 := m0.(type) {
+               case *image.YCbCr:
+                       m1 := m1.(*image.YCbCr)
+                       if err := check(m0.Bounds(), m0.Y, m1.Y, m0.YStride, m1.YStride); err != nil {
+                               t.Errorf("%s (Y): %v", tc, err)
+                               continue
+                       }
+                       if err := check(m0.Bounds(), m0.Cb, m1.Cb, m0.CStride, m1.CStride); err != nil {
+                               t.Errorf("%s (Cb): %v", tc, err)
+                               continue
+                       }
+                       if err := check(m0.Bounds(), m0.Cr, m1.Cr, m0.CStride, m1.CStride); err != nil {
+                               t.Errorf("%s (Cr): %v", tc, err)
+                               continue
+                       }
+               case *image.Gray:
+                       m1 := m1.(*image.Gray)
+                       if err := check(m0.Bounds(), m0.Pix, m1.Pix, m0.Stride, m1.Stride); err != nil {
+                               t.Errorf("%s: %v", tc, err)
+                               continue
+                       }
+               default:
+                       t.Errorf("%s: unexpected image type %T", tc, m0)
+                       continue
+               }
+       }
+}
+
+func decodeFile(filename string) (image.Image, error) {
+       f, err := os.Open(filename)
+       if err != nil {
+               return nil, err
+       }
+       defer f.Close()
+       return Decode(f)
+
+}
+
+// check checks that the two pix data are equal, within the given bounds.
+func check(bounds image.Rectangle, pix0, pix1 []byte, stride0, stride1 int) error {
+       if len(pix0) != len(pix1) {
+               return fmt.Errorf("len(pix) %d and %d differ", len(pix0), len(pix1))
+       }
+       if stride0 != stride1 {
+               return fmt.Errorf("strides %d and %d differ", stride0, stride1)
+       }
+       if stride0%8 != 0 {
+               return fmt.Errorf("stride %d is not a multiple of 8", stride0)
+       }
+       // Compare the two pix data, one 8x8 block at a time.
+       for y := 0; y < len(pix0)/stride0; y += 8 {
+               for x := 0; x < stride0; x += 8 {
+                       if x >= bounds.Max.X || y >= bounds.Max.Y {
+                               // We don't care if the two pix data differ if the 8x8 block is
+                               // entirely outside of the image's bounds. For example, this can
+                               // occur with a 4:2:0 chroma subsampling and a 1x1 image. Baseline
+                               // decoding works on the one 16x16 MCU as a whole; progressive
+                               // decoding's first pass works on that 16x16 MCU as a whole but
+                               // refinement passes only process one 8x8 block within the MCU.
+                               continue
+                       }
+
+                       for j := 0; j < 8; j++ {
+                               for i := 0; i < 8; i++ {
+                                       index := (y+j)*stride0 + (x + i)
+                                       if pix0[index] != pix1[index] {
+                                               return fmt.Errorf("blocks at (%d, %d) differ:\n%sand\n%s", x, y,
+                                                       pixString(pix0, stride0, x, y),
+                                                       pixString(pix1, stride1, x, y),
+                                               )
+                                       }
+                               }
+                       }
+               }
+       }
+       return nil
+}
+
+func pixString(pix []byte, stride, x, y int) string {
+       s := bytes.NewBuffer(nil)
+       for j := 0; j < 8; j++ {
+               fmt.Fprintf(s, "\t")
+               for i := 0; i < 8; i++ {
+                       fmt.Fprintf(s, "%02x ", pix[(y+j)*stride+(x+i)])
+               }
+               fmt.Fprintf(s, "\n")
+       }
+       return s.String()
+}
+
+func benchmarkDecode(b *testing.B, filename string) {
+       b.StopTimer()
+       data, err := ioutil.ReadFile(filename)
+       if err != nil {
+               b.Fatal(err)
+       }
+       cfg, err := DecodeConfig(bytes.NewReader(data))
+       if err != nil {
+               b.Fatal(err)
+       }
+       b.SetBytes(int64(cfg.Width * cfg.Height * 4))
+       b.StartTimer()
+       for i := 0; i < b.N; i++ {
+               Decode(bytes.NewReader(data))
+       }
+}
+
+func BenchmarkDecodeBaseline(b *testing.B) {
+       benchmarkDecode(b, "../testdata/video-001.jpeg")
+}
+
+func BenchmarkDecodeProgressive(b *testing.B) {
+       benchmarkDecode(b, "../testdata/video-001.progressive.jpeg")
+}