]> Cypherpunks repositories - gostls13.git/commitdiff
runtime: faster GC mark phase
authorDmitriy Vyukov <dvyukov@google.com>
Thu, 24 May 2012 06:55:50 +0000 (10:55 +0400)
committerDmitriy Vyukov <dvyukov@google.com>
Thu, 24 May 2012 06:55:50 +0000 (10:55 +0400)
Also bump MaxGcproc to 8.

benchmark             old ns/op    new ns/op    delta
Parser               3796323000   3763880000   -0.85%
Parser-2             3591752500   3518560250   -2.04%
Parser-4             3423825250   3334955250   -2.60%
Parser-8             3304585500   3267014750   -1.14%
Parser-16            3313615750   3286160500   -0.83%

Tree                  984128500    942501166   -4.23%
Tree-2                932564444    883266222   -5.29%
Tree-4                835831000    799912777   -4.30%
Tree-8                819238500    789717333   -3.73%
Tree-16               880837833    837840055   -5.13%

Tree2                 604698100    579716900   -4.13%
Tree2-2               372414500    356765200   -4.20%
Tree2-4               187488100    177455900   -5.56%
Tree2-8               136315300    102086700  -25.11%
Tree2-16               93725900     76705800  -22.18%

ParserPause           157441210    166202783   +5.56%
ParserPause-2          93842650     85199900   -9.21%
ParserPause-4          56844404     53535684   -5.82%
ParserPause-8          35739446     30767613  -16.15%
ParserPause-16         32718255     27212441  -16.83%

TreePause              29610557     29787725   +0.60%
TreePause-2            24001659     20674421  -13.86%
TreePause-4            15114887     12842781  -15.03%
TreePause-8            13128725     10741747  -22.22%
TreePause-16           16131360     12506901  -22.47%

Tree2Pause           2673350920   2651045280   -0.83%
Tree2Pause-2         1796999200   1709350040   -4.88%
Tree2Pause-4         1163553320   1090706480   -6.67%
Tree2Pause-8          987032520    858916360  -25.11%
Tree2Pause-16         864758560    809567480   -6.81%

ParserLastPause       280537000    289047000   +3.03%
ParserLastPause-2     183030000    166748000   -8.90%
ParserLastPause-4     105817000     91552000  -13.48%
ParserLastPause-8      65127000     53288000  -18.18%
ParserLastPause-16     45258000     38334000  -15.30%

TreeLastPause          45072000     51449000  +12.39%
TreeLastPause-2        39269000     37866000   -3.57%
TreeLastPause-4        23564000     20649000  -12.37%
TreeLastPause-8        20881000     15807000  -24.30%
TreeLastPause-16       23297000     17309000  -25.70%

Tree2LastPause       6046912000   5797120000   -4.13%
Tree2LastPause-2     3724034000   3567592000   -4.20%
Tree2LastPause-4     1874831000   1774524000   -5.65%
Tree2LastPause-8     1363108000   1020809000  -12.79%
Tree2LastPause-16     937208000    767019000  -22.18%

R=rsc, 0xe2.0x9a.0x9b
CC=golang-dev
https://golang.org/cl/6223050

src/pkg/runtime/malloc.h
src/pkg/runtime/mgc0.c

index bc186981cd3307dc22ed983cee48507418431201..065f86a42a41d5b1578f63ba592913c19edc82b8 100644 (file)
@@ -124,8 +124,8 @@ enum
        // Max number of threads to run garbage collection.
        // 2, 3, and 4 are all plausible maximums depending
        // on the hardware details of the machine.  The garbage
-       // collector scales well to 4 cpus.
-       MaxGcproc = 4,
+       // collector scales well to 8 cpus.
+       MaxGcproc = 8,
 };
 
 // A generic linked list of blocks.  (Typically the block is bigger than sizeof(MLink).)
index 4064f6916b48982ffa88fcba3c650d41a229d76b..d718b5aea96689dd1548f8ca5912635ecb2b10d1 100644 (file)
@@ -13,6 +13,7 @@ enum {
        Debug = 0,
        PtrSize = sizeof(void*),
        DebugMark = 0,  // run second pass to check mark
+       DataBlock = 8*1024,
 
        // Four bits per word (see #defines below).
        wordsPerBitmapWord = sizeof(void*)*8/4,
@@ -72,9 +73,9 @@ static int32 gctrace;
 typedef struct Workbuf Workbuf;
 struct Workbuf
 {
-       Workbuf *next;
+       LFNode node; // must be first
        uintptr nobj;
-       byte *obj[512-2];
+       byte *obj[512-(sizeof(LFNode)+sizeof(uintptr))/sizeof(byte*)];
 };
 
 typedef struct Finalizer Finalizer;
@@ -112,21 +113,32 @@ static Workbuf* getfull(Workbuf*);
 static void    putempty(Workbuf*);
 static Workbuf* handoff(Workbuf*);
 
+typedef struct GcRoot GcRoot;
+struct GcRoot
+{
+       byte *p;
+       uintptr n;
+};
+
 static struct {
-       Lock fmu;
-       Workbuf *full;
-       Lock emu;
-       Workbuf *empty;
+       uint64  full;  // lock-free list of full blocks
+       uint64  empty; // lock-free list of empty blocks
+       byte    pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait
        uint32  nproc;
        volatile uint32 nwait;
        volatile uint32 ndone;
        volatile uint32 debugmarkdone;
        Note    alldone;
+       ParFor  *markfor;
        ParFor  *sweepfor;
 
        Lock;
        byte    *chunk;
        uintptr nchunk;
+
+       GcRoot  *roots;
+       uint32  nroot;
+       uint32  rootcap;
 } work;
 
 // scanblock scans a block of n bytes starting at pointer b for references
@@ -162,7 +174,7 @@ scanblock(byte *b, int64 n)
        nobj = 0;  // number of queued objects
 
        // Scanblock helpers pass b==nil.
-       // The main proc needs to return to make more
+       // Procs needs to return to make more
        // calls to scanblock.  But if work.nproc==1 then
        // might as well process blocks as soon as we
        // have them.
@@ -246,6 +258,14 @@ scanblock(byte *b, int64 n)
                        bits = xbits >> shift;
 
                found:
+                       // If another proc wants a pointer, give it some.
+                       if(work.nwait > 0 && nobj > 4 && work.full == 0) {
+                               wbuf->nobj = nobj;
+                               wbuf = handoff(wbuf);
+                               nobj = wbuf->nobj;
+                               wp = wbuf->obj + nobj;
+                       }
+
                        // Now we have bits, bitp, and shift correct for
                        // obj pointing at the base of the object.
                        // Only care about allocated and not marked.
@@ -269,14 +289,6 @@ scanblock(byte *b, int64 n)
 
                        PREFETCH(obj);
 
-                       // If another proc wants a pointer, give it some.
-                       if(nobj > 4 && work.nwait > 0 && work.full == nil) {
-                               wbuf->nobj = nobj;
-                               wbuf = handoff(wbuf);
-                               nobj = wbuf->nobj;
-                               wp = wbuf->obj + nobj;
-                       }
-
                        // If buffer is full, get a new one.
                        if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
                                if(wbuf != nil)
@@ -296,7 +308,8 @@ scanblock(byte *b, int64 n)
                // Fetch b from the work buffer.
                if(nobj == 0) {
                        if(!keepworking) {
-                               putempty(wbuf);
+                               if(wbuf)
+                                       putempty(wbuf);
                                return;
                        }
                        // Emptied our buffer: refill.
@@ -401,53 +414,33 @@ debug_scanblock(byte *b, int64 n)
        }
 }
 
+static void
+markroot(ParFor *desc, uint32 i)
+{
+       USED(&desc);
+       scanblock(work.roots[i].p, work.roots[i].n);
+}
+
 // Get an empty work buffer off the work.empty list,
 // allocating new buffers as needed.
 static Workbuf*
 getempty(Workbuf *b)
 {
-       if(work.nproc == 1) {
-               // Put b on full list.
-               if(b != nil) {
-                       b->next = work.full;
-                       work.full = b;
+       if(b != nil)
+               runtime·lfstackpush(&work.full, &b->node);
+       b = (Workbuf*)runtime·lfstackpop(&work.empty);
+       if(b == nil) {
+               // Need to allocate.
+               runtime·lock(&work);
+               if(work.nchunk < sizeof *b) {
+                       work.nchunk = 1<<20;
+                       work.chunk = runtime·SysAlloc(work.nchunk);
                }
-               // Grab from empty list if possible.
-               b = work.empty;
-               if(b != nil) {
-                       work.empty = b->next;
-                       goto haveb;
-               }
-       } else {
-               // Put b on full list.
-               if(b != nil) {
-                       runtime·lock(&work.fmu);
-                       b->next = work.full;
-                       work.full = b;
-                       runtime·unlock(&work.fmu);
-               }
-               // Grab from empty list if possible.
-               runtime·lock(&work.emu);
-               b = work.empty;
-               if(b != nil)
-                       work.empty = b->next;
-               runtime·unlock(&work.emu);
-               if(b != nil)
-                       goto haveb;
-       }
-
-       // Need to allocate.
-       runtime·lock(&work);
-       if(work.nchunk < sizeof *b) {
-               work.nchunk = 1<<20;
-               work.chunk = runtime·SysAlloc(work.nchunk);
+               b = (Workbuf*)work.chunk;
+               work.chunk += sizeof *b;
+               work.nchunk -= sizeof *b;
+               runtime·unlock(&work);
        }
-       b = (Workbuf*)work.chunk;
-       work.chunk += sizeof *b;
-       work.nchunk -= sizeof *b;
-       runtime·unlock(&work);
-
-haveb:
        b->nobj = 0;
        return b;
 }
@@ -455,19 +448,7 @@ haveb:
 static void
 putempty(Workbuf *b)
 {
-       if(b == nil)
-               return;
-
-       if(work.nproc == 1) {
-               b->next = work.empty;
-               work.empty = b;
-               return;
-       }
-
-       runtime·lock(&work.emu);
-       b->next = work.empty;
-       work.empty = b;
-       runtime·unlock(&work.emu);
+       runtime·lfstackpush(&work.empty, &b->node);
 }
 
 // Get a full work buffer off the work.full list, or return nil.
@@ -475,54 +456,21 @@ static Workbuf*
 getfull(Workbuf *b)
 {
        int32 i;
-       Workbuf *b1;
 
-       if(work.nproc == 1) {
-               // Put b on empty list.
-               if(b != nil) {
-                       b->next = work.empty;
-                       work.empty = b;
-               }
-               // Grab from full list if possible.
-               // Since work.nproc==1, no one else is
-               // going to give us work.
-               b = work.full;
-               if(b != nil)
-                       work.full = b->next;
+       if(b != nil)
+               runtime·lfstackpush(&work.empty, &b->node);
+       b = (Workbuf*)runtime·lfstackpop(&work.full);
+       if(b != nil || work.nproc == 1)
                return b;
-       }
-
-       putempty(b);
-
-       // Grab buffer from full list if possible.
-       for(;;) {
-               b1 = work.full;
-               if(b1 == nil)
-                       break;
-               runtime·lock(&work.fmu);
-               if(work.full != nil) {
-                       b1 = work.full;
-                       work.full = b1->next;
-                       runtime·unlock(&work.fmu);
-                       return b1;
-               }
-               runtime·unlock(&work.fmu);
-       }
 
        runtime·xadd(&work.nwait, +1);
        for(i=0;; i++) {
-               b1 = work.full;
-               if(b1 != nil) {
-                       runtime·lock(&work.fmu);
-                       if(work.full != nil) {
-                               runtime·xadd(&work.nwait, -1);
-                               b1 = work.full;
-                               work.full = b1->next;
-                               runtime·unlock(&work.fmu);
-                               return b1;
-                       }
-                       runtime·unlock(&work.fmu);
-                       continue;
+               if(work.full != 0) {
+                       runtime·xadd(&work.nwait, -1);
+                       b = (Workbuf*)runtime·lfstackpop(&work.full);
+                       if(b != nil)
+                               return b;
+                       runtime·xadd(&work.nwait, +1);
                }
                if(work.nwait == work.nproc)
                        return nil;
@@ -555,17 +503,35 @@ handoff(Workbuf *b)
        m->gcstats.nhandoffcnt += n;
 
        // Put b on full list - let first half of b get stolen.
-       runtime·lock(&work.fmu);
-       b->next = work.full;
-       work.full = b;
-       runtime·unlock(&work.fmu);
-
+       runtime·lfstackpush(&work.full, &b->node);
        return b1;
 }
 
-// Scanstack calls scanblock on each of gp's stack segments.
 static void
-scanstack(void (*scanblock)(byte*, int64), G *gp)
+addroot(byte *p, uintptr n)
+{
+       uint32 cap;
+       GcRoot *new;
+
+       if(work.nroot >= work.rootcap) {
+               cap = PageSize/sizeof(GcRoot);
+               if(cap < 2*work.rootcap)
+                       cap = 2*work.rootcap;
+               new = (GcRoot*)runtime·SysAlloc(cap*sizeof(GcRoot));
+               if(work.roots != nil) {
+                       runtime·memmove(new, work.roots, work.rootcap*sizeof(GcRoot));
+                       runtime·SysFree(work.roots, work.rootcap*sizeof(GcRoot));
+               }
+               work.roots = new;
+               work.rootcap = cap;
+       }
+       work.roots[work.nroot].p = p;
+       work.roots[work.nroot].n = n;
+       work.nroot++;
+}
+
+static void
+addstackroots(G *gp)
 {
        M *mp;
        int32 n;
@@ -598,15 +564,13 @@ scanstack(void (*scanblock)(byte*, int64), G *gp)
                }
        }
 
-       if(Debug > 1)
-               runtime·printf("scanstack %d %p\n", gp->goid, sp);
        n = 0;
        while(stk) {
                if(sp < guard-StackGuard || (byte*)stk < sp) {
                        runtime·printf("scanstack inconsistent: g%d#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk);
                        runtime·throw("scanstack");
                }
-               scanblock(sp, (byte*)stk - sp);
+               addroot(sp, (byte*)stk - sp);
                sp = stk->gobuf.sp;
                guard = stk->stackguard;
                stk = (Stktop*)stk->stackbase;
@@ -614,10 +578,8 @@ scanstack(void (*scanblock)(byte*, int64), G *gp)
        }
 }
 
-// Markfin calls scanblock on the blocks that have finalizers:
-// the things pointed at cannot be freed until the finalizers have run.
 static void
-markfin(void *v)
+addfinroots(void *v)
 {
        uintptr size;
 
@@ -626,30 +588,22 @@ markfin(void *v)
                runtime·throw("mark - finalizer inconsistency");
 
        // do not mark the finalizer block itself.  just mark the things it points at.
-       scanblock(v, size);
-}
-
-static void
-debug_markfin(void *v)
-{
-       uintptr size;
-
-       if(!runtime·mlookup(v, &v, &size, nil))
-               runtime·throw("debug_mark - finalizer inconsistency");
-       debug_scanblock(v, size);
+       addroot(v, size);
 }
 
-// Mark
 static void
-mark(void (*scan)(byte*, int64))
+addroots(void)
 {
        G *gp;
        FinBlock *fb;
+       byte *p;
+
+       work.nroot = 0;
 
        // mark data+bss.
-       scan(data, ebss - data);
+       for(p=data; p<ebss; p+=DataBlock)
+               addroot(p, p+DataBlock < ebss ? DataBlock : ebss-p);
 
-       // mark stacks
        for(gp=runtime·allg; gp!=nil; gp=gp->alllink) {
                switch(gp->status){
                default:
@@ -660,27 +614,20 @@ mark(void (*scan)(byte*, int64))
                case Grunning:
                        if(gp != g)
                                runtime·throw("mark - world not stopped");
-                       scanstack(scan, gp);
+                       addstackroots(gp);
                        break;
                case Grunnable:
                case Gsyscall:
                case Gwaiting:
-                       scanstack(scan, gp);
+                       addstackroots(gp);
                        break;
                }
        }
 
-       // mark things pointed at by objects with finalizers
-       if(scan == debug_scanblock)
-               runtime·walkfintab(debug_markfin);
-       else
-               runtime·walkfintab(markfin);
+       runtime·walkfintab(addfinroots);
 
        for(fb=allfin; fb; fb=fb->alllink)
-               scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
-
-       // in multiproc mode, join in the queued work.
-       scan(nil, 0);
+               addroot((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
 }
 
 static bool
@@ -825,6 +772,9 @@ sweepspan(ParFor *desc, uint32 idx)
 void
 runtime·gchelper(void)
 {
+       // parallel mark for over gc roots
+       runtime·parfordo(work.markfor);
+       // help other threads scan secondary blocks
        scanblock(nil, 0);
 
        if(DebugMark) {
@@ -902,6 +852,7 @@ runtime·gc(int32 force)
        uint64 heap0, heap1, obj0, obj1;
        byte *p;
        GCStats stats;
+       uint32 i;
 
        // The gc is turned off (via enablegc) until
        // the bootstrap has completed.
@@ -953,6 +904,10 @@ runtime·gc(int32 force)
        work.ndone = 0;
        work.debugmarkdone = 0;
        work.nproc = runtime·gcprocs();
+       addroots();
+       if(work.markfor == nil)
+               work.markfor = runtime·parforalloc(MaxGcproc);
+       runtime·parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot);
        if(work.sweepfor == nil)
                work.sweepfor = runtime·parforalloc(MaxGcproc);
        runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap.nspan, nil, true, sweepspan);
@@ -961,9 +916,12 @@ runtime·gc(int32 force)
                runtime·helpgc(work.nproc);
        }
 
-       mark(scanblock);
+       runtime·parfordo(work.markfor);
+       scanblock(nil, 0);
+
        if(DebugMark) {
-               mark(debug_scanblock);
+               for(i=0; i<work.nroot; i++)
+                       debug_scanblock(work.roots[i].p, work.roots[i].n);
                runtime·atomicstore(&work.debugmarkdone, 1);
        }
        t1 = runtime·nanotime();