From b0702bd0dba1451f908f9d503a82a8fd3cf3f2c9 Mon Sep 17 00:00:00 2001 From: Dmitriy Vyukov Date: Thu, 24 May 2012 10:55:50 +0400 Subject: [PATCH] runtime: faster GC mark phase Also bump MaxGcproc to 8. benchmark old ns/op new ns/op delta Parser 3796323000 3763880000 -0.85% Parser-2 3591752500 3518560250 -2.04% Parser-4 3423825250 3334955250 -2.60% Parser-8 3304585500 3267014750 -1.14% Parser-16 3313615750 3286160500 -0.83% Tree 984128500 942501166 -4.23% Tree-2 932564444 883266222 -5.29% Tree-4 835831000 799912777 -4.30% Tree-8 819238500 789717333 -3.73% Tree-16 880837833 837840055 -5.13% Tree2 604698100 579716900 -4.13% Tree2-2 372414500 356765200 -4.20% Tree2-4 187488100 177455900 -5.56% Tree2-8 136315300 102086700 -25.11% Tree2-16 93725900 76705800 -22.18% ParserPause 157441210 166202783 +5.56% ParserPause-2 93842650 85199900 -9.21% ParserPause-4 56844404 53535684 -5.82% ParserPause-8 35739446 30767613 -16.15% ParserPause-16 32718255 27212441 -16.83% TreePause 29610557 29787725 +0.60% TreePause-2 24001659 20674421 -13.86% TreePause-4 15114887 12842781 -15.03% TreePause-8 13128725 10741747 -22.22% TreePause-16 16131360 12506901 -22.47% Tree2Pause 2673350920 2651045280 -0.83% Tree2Pause-2 1796999200 1709350040 -4.88% Tree2Pause-4 1163553320 1090706480 -6.67% Tree2Pause-8 987032520 858916360 -25.11% Tree2Pause-16 864758560 809567480 -6.81% ParserLastPause 280537000 289047000 +3.03% ParserLastPause-2 183030000 166748000 -8.90% ParserLastPause-4 105817000 91552000 -13.48% ParserLastPause-8 65127000 53288000 -18.18% ParserLastPause-16 45258000 38334000 -15.30% TreeLastPause 45072000 51449000 +12.39% TreeLastPause-2 39269000 37866000 -3.57% TreeLastPause-4 23564000 20649000 -12.37% TreeLastPause-8 20881000 15807000 -24.30% TreeLastPause-16 23297000 17309000 -25.70% Tree2LastPause 6046912000 5797120000 -4.13% Tree2LastPause-2 3724034000 3567592000 -4.20% Tree2LastPause-4 1874831000 1774524000 -5.65% Tree2LastPause-8 1363108000 1020809000 -12.79% Tree2LastPause-16 937208000 767019000 -22.18% R=rsc, 0xe2.0x9a.0x9b CC=golang-dev https://golang.org/cl/6223050 --- src/pkg/runtime/malloc.h | 4 +- src/pkg/runtime/mgc0.c | 264 ++++++++++++++++----------------------- 2 files changed, 113 insertions(+), 155 deletions(-) diff --git a/src/pkg/runtime/malloc.h b/src/pkg/runtime/malloc.h index bc186981cd..065f86a42a 100644 --- a/src/pkg/runtime/malloc.h +++ b/src/pkg/runtime/malloc.h @@ -124,8 +124,8 @@ enum // Max number of threads to run garbage collection. // 2, 3, and 4 are all plausible maximums depending // on the hardware details of the machine. The garbage - // collector scales well to 4 cpus. - MaxGcproc = 4, + // collector scales well to 8 cpus. + MaxGcproc = 8, }; // A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).) diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c index 4064f6916b..d718b5aea9 100644 --- a/src/pkg/runtime/mgc0.c +++ b/src/pkg/runtime/mgc0.c @@ -13,6 +13,7 @@ enum { Debug = 0, PtrSize = sizeof(void*), DebugMark = 0, // run second pass to check mark + DataBlock = 8*1024, // Four bits per word (see #defines below). wordsPerBitmapWord = sizeof(void*)*8/4, @@ -72,9 +73,9 @@ static int32 gctrace; typedef struct Workbuf Workbuf; struct Workbuf { - Workbuf *next; + LFNode node; // must be first uintptr nobj; - byte *obj[512-2]; + byte *obj[512-(sizeof(LFNode)+sizeof(uintptr))/sizeof(byte*)]; }; typedef struct Finalizer Finalizer; @@ -112,21 +113,32 @@ static Workbuf* getfull(Workbuf*); static void putempty(Workbuf*); static Workbuf* handoff(Workbuf*); +typedef struct GcRoot GcRoot; +struct GcRoot +{ + byte *p; + uintptr n; +}; + static struct { - Lock fmu; - Workbuf *full; - Lock emu; - Workbuf *empty; + uint64 full; // lock-free list of full blocks + uint64 empty; // lock-free list of empty blocks + byte pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait uint32 nproc; volatile uint32 nwait; volatile uint32 ndone; volatile uint32 debugmarkdone; Note alldone; + ParFor *markfor; ParFor *sweepfor; Lock; byte *chunk; uintptr nchunk; + + GcRoot *roots; + uint32 nroot; + uint32 rootcap; } work; // scanblock scans a block of n bytes starting at pointer b for references @@ -162,7 +174,7 @@ scanblock(byte *b, int64 n) nobj = 0; // number of queued objects // Scanblock helpers pass b==nil. - // The main proc needs to return to make more + // Procs needs to return to make more // calls to scanblock. But if work.nproc==1 then // might as well process blocks as soon as we // have them. @@ -246,6 +258,14 @@ scanblock(byte *b, int64 n) bits = xbits >> shift; found: + // If another proc wants a pointer, give it some. + if(work.nwait > 0 && nobj > 4 && work.full == 0) { + wbuf->nobj = nobj; + wbuf = handoff(wbuf); + nobj = wbuf->nobj; + wp = wbuf->obj + nobj; + } + // Now we have bits, bitp, and shift correct for // obj pointing at the base of the object. // Only care about allocated and not marked. @@ -269,14 +289,6 @@ scanblock(byte *b, int64 n) PREFETCH(obj); - // If another proc wants a pointer, give it some. - if(nobj > 4 && work.nwait > 0 && work.full == nil) { - wbuf->nobj = nobj; - wbuf = handoff(wbuf); - nobj = wbuf->nobj; - wp = wbuf->obj + nobj; - } - // If buffer is full, get a new one. if(wbuf == nil || nobj >= nelem(wbuf->obj)) { if(wbuf != nil) @@ -296,7 +308,8 @@ scanblock(byte *b, int64 n) // Fetch b from the work buffer. if(nobj == 0) { if(!keepworking) { - putempty(wbuf); + if(wbuf) + putempty(wbuf); return; } // Emptied our buffer: refill. @@ -401,53 +414,33 @@ debug_scanblock(byte *b, int64 n) } } +static void +markroot(ParFor *desc, uint32 i) +{ + USED(&desc); + scanblock(work.roots[i].p, work.roots[i].n); +} + // Get an empty work buffer off the work.empty list, // allocating new buffers as needed. static Workbuf* getempty(Workbuf *b) { - if(work.nproc == 1) { - // Put b on full list. - if(b != nil) { - b->next = work.full; - work.full = b; + if(b != nil) + runtime·lfstackpush(&work.full, &b->node); + b = (Workbuf*)runtime·lfstackpop(&work.empty); + if(b == nil) { + // Need to allocate. + runtime·lock(&work); + if(work.nchunk < sizeof *b) { + work.nchunk = 1<<20; + work.chunk = runtime·SysAlloc(work.nchunk); } - // Grab from empty list if possible. - b = work.empty; - if(b != nil) { - work.empty = b->next; - goto haveb; - } - } else { - // Put b on full list. - if(b != nil) { - runtime·lock(&work.fmu); - b->next = work.full; - work.full = b; - runtime·unlock(&work.fmu); - } - // Grab from empty list if possible. - runtime·lock(&work.emu); - b = work.empty; - if(b != nil) - work.empty = b->next; - runtime·unlock(&work.emu); - if(b != nil) - goto haveb; - } - - // Need to allocate. - runtime·lock(&work); - if(work.nchunk < sizeof *b) { - work.nchunk = 1<<20; - work.chunk = runtime·SysAlloc(work.nchunk); + b = (Workbuf*)work.chunk; + work.chunk += sizeof *b; + work.nchunk -= sizeof *b; + runtime·unlock(&work); } - b = (Workbuf*)work.chunk; - work.chunk += sizeof *b; - work.nchunk -= sizeof *b; - runtime·unlock(&work); - -haveb: b->nobj = 0; return b; } @@ -455,19 +448,7 @@ haveb: static void putempty(Workbuf *b) { - if(b == nil) - return; - - if(work.nproc == 1) { - b->next = work.empty; - work.empty = b; - return; - } - - runtime·lock(&work.emu); - b->next = work.empty; - work.empty = b; - runtime·unlock(&work.emu); + runtime·lfstackpush(&work.empty, &b->node); } // Get a full work buffer off the work.full list, or return nil. @@ -475,54 +456,21 @@ static Workbuf* getfull(Workbuf *b) { int32 i; - Workbuf *b1; - if(work.nproc == 1) { - // Put b on empty list. - if(b != nil) { - b->next = work.empty; - work.empty = b; - } - // Grab from full list if possible. - // Since work.nproc==1, no one else is - // going to give us work. - b = work.full; - if(b != nil) - work.full = b->next; + if(b != nil) + runtime·lfstackpush(&work.empty, &b->node); + b = (Workbuf*)runtime·lfstackpop(&work.full); + if(b != nil || work.nproc == 1) return b; - } - - putempty(b); - - // Grab buffer from full list if possible. - for(;;) { - b1 = work.full; - if(b1 == nil) - break; - runtime·lock(&work.fmu); - if(work.full != nil) { - b1 = work.full; - work.full = b1->next; - runtime·unlock(&work.fmu); - return b1; - } - runtime·unlock(&work.fmu); - } runtime·xadd(&work.nwait, +1); for(i=0;; i++) { - b1 = work.full; - if(b1 != nil) { - runtime·lock(&work.fmu); - if(work.full != nil) { - runtime·xadd(&work.nwait, -1); - b1 = work.full; - work.full = b1->next; - runtime·unlock(&work.fmu); - return b1; - } - runtime·unlock(&work.fmu); - continue; + if(work.full != 0) { + runtime·xadd(&work.nwait, -1); + b = (Workbuf*)runtime·lfstackpop(&work.full); + if(b != nil) + return b; + runtime·xadd(&work.nwait, +1); } if(work.nwait == work.nproc) return nil; @@ -555,17 +503,35 @@ handoff(Workbuf *b) m->gcstats.nhandoffcnt += n; // Put b on full list - let first half of b get stolen. - runtime·lock(&work.fmu); - b->next = work.full; - work.full = b; - runtime·unlock(&work.fmu); - + runtime·lfstackpush(&work.full, &b->node); return b1; } -// Scanstack calls scanblock on each of gp's stack segments. static void -scanstack(void (*scanblock)(byte*, int64), G *gp) +addroot(byte *p, uintptr n) +{ + uint32 cap; + GcRoot *new; + + if(work.nroot >= work.rootcap) { + cap = PageSize/sizeof(GcRoot); + if(cap < 2*work.rootcap) + cap = 2*work.rootcap; + new = (GcRoot*)runtime·SysAlloc(cap*sizeof(GcRoot)); + if(work.roots != nil) { + runtime·memmove(new, work.roots, work.rootcap*sizeof(GcRoot)); + runtime·SysFree(work.roots, work.rootcap*sizeof(GcRoot)); + } + work.roots = new; + work.rootcap = cap; + } + work.roots[work.nroot].p = p; + work.roots[work.nroot].n = n; + work.nroot++; +} + +static void +addstackroots(G *gp) { M *mp; int32 n; @@ -598,15 +564,13 @@ scanstack(void (*scanblock)(byte*, int64), G *gp) } } - if(Debug > 1) - runtime·printf("scanstack %d %p\n", gp->goid, sp); n = 0; while(stk) { if(sp < guard-StackGuard || (byte*)stk < sp) { runtime·printf("scanstack inconsistent: g%d#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk); runtime·throw("scanstack"); } - scanblock(sp, (byte*)stk - sp); + addroot(sp, (byte*)stk - sp); sp = stk->gobuf.sp; guard = stk->stackguard; stk = (Stktop*)stk->stackbase; @@ -614,10 +578,8 @@ scanstack(void (*scanblock)(byte*, int64), G *gp) } } -// Markfin calls scanblock on the blocks that have finalizers: -// the things pointed at cannot be freed until the finalizers have run. static void -markfin(void *v) +addfinroots(void *v) { uintptr size; @@ -626,30 +588,22 @@ markfin(void *v) runtime·throw("mark - finalizer inconsistency"); // do not mark the finalizer block itself. just mark the things it points at. - scanblock(v, size); -} - -static void -debug_markfin(void *v) -{ - uintptr size; - - if(!runtime·mlookup(v, &v, &size, nil)) - runtime·throw("debug_mark - finalizer inconsistency"); - debug_scanblock(v, size); + addroot(v, size); } -// Mark static void -mark(void (*scan)(byte*, int64)) +addroots(void) { G *gp; FinBlock *fb; + byte *p; + + work.nroot = 0; // mark data+bss. - scan(data, ebss - data); + for(p=data; palllink) { switch(gp->status){ default: @@ -660,27 +614,20 @@ mark(void (*scan)(byte*, int64)) case Grunning: if(gp != g) runtime·throw("mark - world not stopped"); - scanstack(scan, gp); + addstackroots(gp); break; case Grunnable: case Gsyscall: case Gwaiting: - scanstack(scan, gp); + addstackroots(gp); break; } } - // mark things pointed at by objects with finalizers - if(scan == debug_scanblock) - runtime·walkfintab(debug_markfin); - else - runtime·walkfintab(markfin); + runtime·walkfintab(addfinroots); for(fb=allfin; fb; fb=fb->alllink) - scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0])); - - // in multiproc mode, join in the queued work. - scan(nil, 0); + addroot((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0])); } static bool @@ -825,6 +772,9 @@ sweepspan(ParFor *desc, uint32 idx) void runtime·gchelper(void) { + // parallel mark for over gc roots + runtime·parfordo(work.markfor); + // help other threads scan secondary blocks scanblock(nil, 0); if(DebugMark) { @@ -902,6 +852,7 @@ runtime·gc(int32 force) uint64 heap0, heap1, obj0, obj1; byte *p; GCStats stats; + uint32 i; // The gc is turned off (via enablegc) until // the bootstrap has completed. @@ -953,6 +904,10 @@ runtime·gc(int32 force) work.ndone = 0; work.debugmarkdone = 0; work.nproc = runtime·gcprocs(); + addroots(); + if(work.markfor == nil) + work.markfor = runtime·parforalloc(MaxGcproc); + runtime·parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot); if(work.sweepfor == nil) work.sweepfor = runtime·parforalloc(MaxGcproc); runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap.nspan, nil, true, sweepspan); @@ -961,9 +916,12 @@ runtime·gc(int32 force) runtime·helpgc(work.nproc); } - mark(scanblock); + runtime·parfordo(work.markfor); + scanblock(nil, 0); + if(DebugMark) { - mark(debug_scanblock); + for(i=0; i