static void
 scanblock(byte *b, uintptr n, byte *ptrmask)
 {
-       byte *obj, *p, *arena_start, *arena_used, **wp, *scanbuf[8], bits8;
-       uintptr i, nobj, size, idx, *bitp, bits, xbits, shift, x, off, cached, scanbufpos;
+       byte *obj, *p, *arena_start, *arena_used, **wp, *scanbuf[8], *ptrbitp, *bitp, bits, xbits, shift, cached;
+       uintptr i, nobj, size, idx, x, off, scanbufpos;
        intptr ncached;
        Workbuf *wbuf;
        String *str;
        for(i = 0; i < nelem(scanbuf); i++)
                scanbuf[i] = nil;
 
+       ptrbitp = nil;
+       cached = 0;
+       ncached = 0;
+
        // ptrmask can have 3 possible values:
        // 1. nil - obtain pointer mask from GC bitmap.
        // 2. ScanConservatively - don't use any mask, scan conservatively.
                        }
                        ptrmask = ScanConservatively;
                }
-               cached = 0;
-               ncached = 0;
+               // Find bits of the beginning of the object.
+               if(ptrmask == nil) {
+                       off = (uintptr*)b - (uintptr*)arena_start;
+                       ptrbitp = arena_start - off/wordsPerBitmapByte - 1;
+                       shift = (off % wordsPerBitmapByte) * gcBits;
+                       cached = *ptrbitp >> shift;
+                       cached &= ~bitBoundary;
+                       ncached = (8 - shift)/gcBits;
+               }
                for(i = 0; i < n; i += PtrSize) {
                        obj = nil;
                        // Find bits for this word.
                                // Consult GC bitmap.
                                if(ncached <= 0) {
                                        // Refill cache.
-                                       off = (uintptr*)(b+i) - (uintptr*)arena_start;
-                                       bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-                                       shift = (off % wordsPerBitmapWord) * gcBits;
-                                       cached = *bitp >> shift;
-                                       ncached = (PtrSize*8 - shift)/gcBits;
+                                       cached = *--ptrbitp;
+                                       ncached = 2;
                                }
                                bits = cached;
                                cached >>= gcBits;
                                ncached--;
-                               if(i != 0 && (bits&bitBoundary) != 0)
+                               if((bits&bitBoundary) != 0)
                                        break; // reached beginning of the next object
                                bits = (bits>>2)&BitsMask;
                                if(bits == BitsDead)
                        // Find the next pair of bits.
                        if(ptrmask == nil) {
                                if(ncached <= 0) {
-                                       off = (uintptr*)(b+i+PtrSize) - (uintptr*)arena_start;
-                                       bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-                                       shift = (off % wordsPerBitmapWord) * gcBits;
-                                       cached = *bitp >> shift;
-                                       ncached = (PtrSize*8 - shift)/gcBits;
+                                       // Refill cache.
+                                       cached = *--ptrbitp;
+                                       ncached = 2;
                                }
                                bits = (cached>>2)&BitsMask;
                        } else
 
                        if(bits == BitsSlice) {
                                i += 2*PtrSize;
-                               cached >>= 2*gcBits;
-                               ncached -= 2;
+                               if(ncached == 2)
+                                       ncached = 0;
+                               else if(ptrmask == nil) {
+                                       // Refill cache and consume one quadruple.
+                                       cached = *--ptrbitp;
+                                       cached >>= gcBits;
+                                       ncached = 1;
+                               }
                        } else {
                                i += PtrSize;
                                cached >>= gcBits;
                                continue;
                        // Mark the object.
                        off = (uintptr*)obj - (uintptr*)arena_start;
-                       bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-                       shift = (off % wordsPerBitmapWord) * gcBits;
+                       bitp = arena_start - off/wordsPerBitmapByte - 1;
+                       shift = (off % wordsPerBitmapByte) * gcBits;
                        xbits = *bitp;
                        bits = (xbits >> shift) & bitMask;
                        if((bits&bitBoundary) == 0) {
-                               // Not a beginning of a block, check if we have block boundary in xbits.
-                               while(shift > 0) {
-                                       obj -= PtrSize;
-                                       shift -= gcBits;
-                                       bits = (xbits >> shift) & bitMask;
-                                       if((bits&bitBoundary) != 0)
-                                               goto havebits;
-                               }
-                               // Otherwise consult span table to find the block beginning.
+                               // Not a beginning of a block, consult span table to find the block beginning.
                                k = (uintptr)obj>>PageShift;
                                x = k;
                                x -= (uintptr)arena_start>>PageShift;
                                goto markobj;
                        }
 
-               havebits:
                        // Now we have bits, bitp, and shift correct for
                        // obj pointing at the base of the object.
                        // Only care about not marked objects.
                        // For 8-byte objects we use non-atomic store, if the other
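-                       // quadruple is already marked. Otherwise we resort to CAS
-                       // loop for marking.
+                       // quadruple is already marked. Otherwise we resort to an
+                       // atomic OR (runtime·atomicor8) for marking.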
-                       bits8 = xbits>>(shift&~7);
-                       if((bits8&(bitMask|(bitMask<<gcBits))) != (bitBoundary|(bitBoundary<<gcBits)) ||
+                       if((xbits&(bitMask|(bitMask<<gcBits))) != (bitBoundary|(bitBoundary<<gcBits)) ||
                                work.nproc == 1)
-                               ((uint8*)bitp)[shift/8] = bits8 | (bitMarked<<(shift&7));
-                       else {
-                               for(;;) {
-                                       if(runtime·casp((void**)bitp, (void*)xbits, (void*)(xbits|(bitMarked<<shift))))
-                                               break;
-                                       xbits = *bitp;
-                                       bits = (xbits>>shift) & bitMask;
-                                       if((bits&bitMarked) != 0)
-                                               break;
-                               }
-                               if((bits&bitMarked) != 0)
-                                       continue;
-                       }
+                               *bitp = xbits | (bitMarked<<shift);
+                       else
+                               runtime·atomicor8(bitp, bitMarked<<shift);
+
                        if(((xbits>>(shift+2))&BitsMask) == BitsDead)
                                continue;  // noscan object
 
 runtime·MSpan_Sweep(MSpan *s, bool preserve)
 {
        int32 cl, n, npages, nfree;
-       uintptr size, off, *bitp, shift, xbits, bits;
+       uintptr size, off, step;
        uint32 sweepgen;
-       byte *p;
+       byte *p, *bitp, shift, xbits, bits;
        MCache *c;
        byte *arena_start;
        MLink head, *end, *link;
        // Mark any free objects in this span so we don't collect them.
        for(link = s->freelist; link != nil; link = link->next) {
                off = (uintptr*)link - (uintptr*)arena_start;
-               bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-               shift = (off % wordsPerBitmapWord) * gcBits;
+               bitp = arena_start - off/wordsPerBitmapByte - 1;
+               shift = (off % wordsPerBitmapByte) * gcBits;
                *bitp |= bitMarked<<shift;
        }
 
                // A finalizer can be set for an inner byte of an object, find object beginning.
                p = (byte*)(s->start << PageShift) + special->offset/size*size;
                off = (uintptr*)p - (uintptr*)arena_start;
-               bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-               shift = (off % wordsPerBitmapWord) * gcBits;
+               bitp = arena_start - off/wordsPerBitmapByte - 1;
+               shift = (off % wordsPerBitmapByte) * gcBits;
                bits = (*bitp>>shift) & bitMask;
                if((bits&bitMarked) == 0) {
                        // Find the exact byte for which the special was setup
        // This thread owns the span now, so it can manipulate
        // the block bitmap without atomic operations.
        p = (byte*)(s->start << PageShift);
+       // Find bits for the beginning of the span.
+       off = (uintptr*)p - (uintptr*)arena_start;
+       bitp = arena_start - off/wordsPerBitmapByte - 1;
+       shift = 0;
+       step = size/(PtrSize*wordsPerBitmapByte);
+       // Rewind to the previous quadruple as we move to the next
+       // in the beginning of the loop.
+       bitp += step;
+       if(step == 0) {
+               // 8-byte objects.
+               bitp++;
+               shift = gcBits;
+       }
        for(; n > 0; n--, p += size) {
-               off = (uintptr*)p - (uintptr*)arena_start;
-               bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-               shift = (off % wordsPerBitmapWord) * gcBits;
+               bitp -= step;
+               if(step == 0) {
+                       if(shift != 0)
+                               bitp--;
+                       shift = gcBits - shift;
+               }
+
                xbits = *bitp;
                bits = (xbits>>shift) & bitMask;
 
 static byte*
 unrollgcprog1(byte *mask, byte *prog, uintptr *ppos, bool inplace, bool sparse)
 {
-       uintptr *b, off, shift, pos, siz, i;
-       byte *arena_start, *prog1, v;
+       uintptr pos, siz, i, off;
+       byte *arena_start, *prog1, v, *bitp, shift;
 
        arena_start = runtime·mheap.arena_start;
        pos = *ppos;
                                if(inplace) {
                                        // Store directly into GC bitmap.
                                        off = (uintptr*)(mask+pos) - (uintptr*)arena_start;
-                                       b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-                                       shift = (off % wordsPerBitmapWord) * gcBits;
-                                       if((shift%8)==0)
-                                               ((byte*)b)[shift/8] = 0;
-                                       ((byte*)b)[shift/8] |= v<<((shift%8)+2);
+                                       bitp = arena_start - off/wordsPerBitmapByte - 1;
+                                       shift = (off % wordsPerBitmapByte) * gcBits;
+                                       if(shift==0)
+                                               *bitp = 0;
+                                       *bitp |= v<<(shift+2);
                                        pos += PtrSize;
                                } else if(sparse) {
                                        // 4-bits per word
 void
 runtime·unrollgcproginplace_m(void)
 {
-       uintptr size, size0, *b, off, shift, pos;
-       byte *arena_start, *prog;
+       uintptr size, size0, pos, off;
+       byte *arena_start, *prog, *bitp, shift;
        Type *typ;
        void *v;
 
-       // Mark first word as bitAllocated.
+       // Mark first word as bitBoundary.
        arena_start = runtime·mheap.arena_start;
        off = (uintptr*)v - (uintptr*)arena_start;
-       b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-       shift = (off % wordsPerBitmapWord) * gcBits;
-       *b |= bitBoundary<<shift;
+       bitp = arena_start - off/wordsPerBitmapByte - 1;
+       shift = (off % wordsPerBitmapByte) * gcBits;
+       *bitp |= bitBoundary<<shift;
        // Mark word after last as BitsDead.
        if(size0 < size) {
                off = (uintptr*)((byte*)v + size0) - (uintptr*)arena_start;
-               b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
-               shift = (off % wordsPerBitmapWord) * gcBits;
-               *b &= ~(bitPtrMask<<shift) | ((uintptr)BitsDead<<(shift+2));
+               bitp = arena_start - off/wordsPerBitmapByte - 1;
+               shift = (off % wordsPerBitmapByte) * gcBits;
+               *bitp &= ~(bitPtrMask<<shift) | ((uintptr)BitsDead<<(shift+2));
        }
 }
 
 void
 runtime·markspan(void *v, uintptr size, uintptr n, bool leftover)
 {
-       uintptr *b, *b0, off, shift, x;
-       byte *p;
+       uintptr i, off, step;
+       byte *b;
 
        if((byte*)v+size*n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
                runtime·throw("markspan: bad pointer");
 
-       p = v;
-       if(leftover)    // mark a boundary just past end of last block too
-               n++;
-
-       b0 = nil;
-       x = 0;
-       for(; n-- > 0; p += size) {
-               // Okay to use non-atomic ops here, because we control
-               // the entire span, and each bitmap word has bits for only
-               // one span, so no other goroutines are changing these
-               // bitmap words.
-               off = (uintptr*)p - (uintptr*)runtime·mheap.arena_start;  // word offset
-               b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
-               shift = (off % wordsPerBitmapWord) * gcBits;
-               if(b0 != b) {
-                       if(b0 != nil)
-                               *b0 = x;
-                       b0 = b;
-                       x = 0;
-               }
-               x |= (bitBoundary<<shift) | ((uintptr)BitsDead<<(shift+2));
+       // Find bits of the beginning of the span.
+       off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;  // word offset
+       b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
+       if((off%wordsPerBitmapByte) != 0)
+               runtime·throw("markspan: unaligned length");
+
+       // Okay to use non-atomic ops here, because we control
+       // the entire span, and each bitmap byte has bits for only
+       // one span, so no other goroutines are changing these bitmap words.
+
+       if(size == PtrSize) {
+               // Possible only on 64-bits (minimal size class is 8 bytes).
+               // Poor man's memset(0x11).
+               if(0x11 != ((bitBoundary+BitsDead)<<gcBits) + (bitBoundary+BitsDead))
+                       runtime·throw("markspan: bad bits");
+               if((n%(wordsPerBitmapByte*PtrSize)) != 0)
+                       runtime·throw("markspan: unaligned length");
+               b = b - n/wordsPerBitmapByte + 1;       // find first byte
+               if(((uintptr)b%PtrSize) != 0)
+                       runtime·throw("markspan: unaligned pointer");
+               for(i = 0; i != n; i += wordsPerBitmapByte*PtrSize, b += PtrSize)
+                       *(uintptr*)b = (uintptr)0x1111111111111111ULL;  // bitBoundary+BitsDead
+               return;
        }
-       *b0 = x;
+
+       if(leftover)
+               n++;    // mark a boundary just past end of last block too
+       step = size/(PtrSize*wordsPerBitmapByte);
+       for(i = 0; i != n; i++, b -= step)
+               *b = bitBoundary|(BitsDead<<2);
 }
 
 // unmark the span of memory at v of length n bytes.
 void
 runtime·unmarkspan(void *v, uintptr n)
 {
-       uintptr *p, *b, off;
+       uintptr off;
+       byte *b;
 
        if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start)
                runtime·throw("markspan: bad pointer");
 
-       p = v;
-       off = p - (uintptr*)runtime·mheap.arena_start;  // word offset
-       if((off % wordsPerBitmapWord) != 0)
+       off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;  // word offset
+       if((off % (PtrSize*wordsPerBitmapByte)) != 0)
                runtime·throw("markspan: unaligned pointer");
-       b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
+       b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
        n /= PtrSize;
-       if(n%wordsPerBitmapWord != 0)
+       if(n%(PtrSize*wordsPerBitmapByte) != 0)
                runtime·throw("unmarkspan: unaligned length");
        // Okay to use non-atomic ops here, because we control
        // the entire span, and each bitmap word has bits for only
        // one span, so no other goroutines are changing these
        // bitmap words.
-       n /= wordsPerBitmapWord;
-       runtime·memclr((byte*)(b - n + 1), n*PtrSize);
+       n /= wordsPerBitmapByte;
+       runtime·memclr(b - n + 1, n);
 }
 
 void
        };
        uintptr n;
 
-       n = (h->arena_used - h->arena_start) / wordsPerBitmapWord;
+       n = (h->arena_used - h->arena_start) / (PtrSize*wordsPerBitmapByte);
        n = ROUND(n, bitmapChunk);
        n = ROUND(n, PhysPageSize);
        if(h->bitmap_mapped >= n)
 runtime·getgcmask(byte *p, Type *t, byte **mask, uintptr *len)
 {
        Stkframe frame;
-       uintptr i, n, off, bits, shift, *b;
-       byte *base;
+       uintptr i, n, off;
+       byte *base, bits, shift, *b;
 
        *mask = nil;
        *len = 0;
                *mask = runtime·mallocgc(*len, nil, 0);
                for(i = 0; i < n; i += PtrSize) {
                        off = (uintptr*)(base+i) - (uintptr*)runtime·mheap.arena_start;
-                       b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
-                       shift = (off % wordsPerBitmapWord) * gcBits;
+                       b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1;
+                       shift = (off % wordsPerBitmapByte) * gcBits;
                        bits = (*b >> (shift+2))&BitsMask;
                        (*mask)[i/PtrSize] = bits;
                }