From: Dmitriy Vyukov Date: Tue, 22 May 2012 17:35:52 +0000 (-0400) Subject: runtime: faster GC sweep phase X-Git-Tag: go1.1rc2~3152 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=845aa1fc2c86d761a96d31de5e168d2a0f76f0da;p=gostls13.git runtime: faster GC sweep phase benchmark old ns/op new ns/op delta garbage.BenchmarkParser 3731065750 3715543750 -0.41% garbage.BenchmarkParser-2 3631299750 3495248500 -3.75% garbage.BenchmarkParser-4 3386486000 3339353000 -1.39% garbage.BenchmarkParser-8 3267632000 3286422500 +0.58% garbage.BenchmarkParser-16 3299203000 3316081750 +0.51% garbage.BenchmarkTree 977532888 919453833 -5.94% garbage.BenchmarkTree-2 919948555 853478000 -7.23% garbage.BenchmarkTree-4 841329000 790207000 -6.08% garbage.BenchmarkTree-8 787792777 740380666 -6.01% garbage.BenchmarkTree-16 899257166 846594555 -5.86% garbage.BenchmarkTree2 574876300 571885800 -0.52% garbage.BenchmarkTree2-2 348162700 345888900 -0.65% garbage.BenchmarkTree2-4 184912500 179137000 -3.22% garbage.BenchmarkTree2-8 104243900 103485600 -0.73% garbage.BenchmarkTree2-16 97269500 85137100 -14.25% garbage.BenchmarkParserPause 141101976 157746974 +11.80% garbage.BenchmarkParserPause-2 103096051 83043048 -19.45% garbage.BenchmarkParserPause-4 52153133 45951111 -11.89% garbage.BenchmarkParserPause-8 36730190 38901024 +5.91% garbage.BenchmarkParserPause-16 32678875 29578585 -9.49% garbage.BenchmarkTreePause 29487065 29648439 +0.55% garbage.BenchmarkTreePause-2 22443494 21306159 -5.07% garbage.BenchmarkTreePause-4 15799691 14985647 -5.15% garbage.BenchmarkTreePause-8 10768112 9531420 -12.97% garbage.BenchmarkTreePause-16 16329891 15205158 -6.89% garbage.BenchmarkTree2Pause 2586957240 2577533200 -0.36% garbage.BenchmarkTree2Pause-2 1683383760 1673923800 -0.56% garbage.BenchmarkTree2Pause-4 1102860320 1074040280 -2.68% garbage.BenchmarkTree2Pause-8 902627920 886122400 -1.86% garbage.BenchmarkTree2Pause-16 856470920 804152320 -6.50% garbage.BenchmarkParserLastPause 277316000 280839000 +1.25% garbage.BenchmarkParserLastPause-2 179446000 163687000 -8.78% garbage.BenchmarkParserLastPause-4 106752000 94144000 -11.81% garbage.BenchmarkParserLastPause-8 57758000 61640000 +6.72% garbage.BenchmarkParserLastPause-16 51235000 42552000 -16.95% garbage.BenchmarkTreeLastPause 45244000 50786000 +12.25% garbage.BenchmarkTreeLastPause-2 37163000 34654000 -6.75% garbage.BenchmarkTreeLastPause-4 24178000 21967000 -9.14% garbage.BenchmarkTreeLastPause-8 20390000 15648000 -30.30% garbage.BenchmarkTreeLastPause-16 22398000 20180000 -9.90% garbage.BenchmarkTree2LastPause 5748706000 5718809000 -0.52% garbage.BenchmarkTree2LastPause-2 3481570000 3458844000 -0.65% garbage.BenchmarkTree2LastPause-4 1849073000 1791330000 -3.22% garbage.BenchmarkTree2LastPause-8 1042375000 1034811000 -0.73% garbage.BenchmarkTree2LastPause-16 972637000 851323000 -14.25% There is also visible improvement in consumed CPU time: tree2 -heapsize=8000000000 -cpus=12 before: 248.74user 6.36system 0:52.74elapsed 483%CPU after: 229.86user 6.33system 0:51.08elapsed 462%CPU -1.66s of real time, but -18.91s of consumed CPU time R=golang-dev CC=golang-dev https://golang.org/cl/6215065 --- diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c index 9ceeeea05d..4064f6916b 100644 --- a/src/pkg/runtime/mgc0.c +++ b/src/pkg/runtime/mgc0.c @@ -120,10 +120,9 @@ static struct { uint32 nproc; volatile uint32 nwait; volatile uint32 ndone; + volatile uint32 debugmarkdone; Note alldone; - Lock markgate; - Lock sweepgate; - uint32 spanidx; + ParFor *sweepfor; Lock; byte *chunk; @@ -720,40 +719,10 @@ handlespecial(byte *p, uintptr size) return true; } -static void sweepspan(MSpan *s); - // Sweep frees or collects finalizers for blocks not marked in the mark phase. // It clears the mark bits in preparation for the next GC round. static void -sweep(void) -{ - MSpan *s, **allspans; - int64 now; - uint32 spanidx, nspan; - - now = runtime·nanotime(); - nspan = runtime·mheap.nspan; - allspans = runtime·mheap.allspans; - for(;;) { - spanidx = runtime·xadd(&work.spanidx, 1) - 1; - if(spanidx >= nspan) - break; - s = allspans[spanidx]; - - // Stamp newly unused spans. The scavenger will use that - // info to potentially give back some pages to the OS. - if(s->state == MSpanFree && s->unusedsince == 0) - s->unusedsince = now; - - if(s->state != MSpanInUse) - continue; - - sweepspan(s); - } -} - -static void -sweepspan(MSpan *s) +sweepspan(ParFor *desc, uint32 idx) { int32 cl, n, npages; uintptr size; @@ -762,7 +731,16 @@ sweepspan(MSpan *s) byte *arena_start; MLink *start, *end; int32 nfree; + MSpan *s; + USED(&desc); + s = runtime·mheap.allspans[idx]; + // Stamp newly unused spans. The scavenger will use that + // info to potentially give back some pages to the OS. + if(s->state == MSpanFree && s->unusedsince == 0) + s->unusedsince = runtime·nanotime(); + if(s->state != MSpanInUse) + return; arena_start = runtime·mheap.arena_start; p = (byte*)(s->start << PageShift); cl = s->sizeclass; @@ -847,16 +825,15 @@ sweepspan(MSpan *s) void runtime·gchelper(void) { - // Wait until main proc is ready for mark help. - runtime·lock(&work.markgate); - runtime·unlock(&work.markgate); scanblock(nil, 0); - // Wait until main proc is ready for sweep help. - runtime·lock(&work.sweepgate); - runtime·unlock(&work.sweepgate); - sweep(); + if(DebugMark) { + // wait while the main thread executes mark(debug_scanblock) + while(runtime·atomicload(&work.debugmarkdone) == 0) + runtime·usleep(10); + } + runtime·parfordo(work.sweepfor); if(runtime·xadd(&work.ndone, +1) == work.nproc-1) runtime·notewakeup(&work.alldone); } @@ -972,33 +949,38 @@ runtime·gc(int32 force) obj0 = mstats.nmalloc - mstats.nfree; } - runtime·lock(&work.markgate); - runtime·lock(&work.sweepgate); - + work.nwait = 0; + work.ndone = 0; + work.debugmarkdone = 0; work.nproc = runtime·gcprocs(); + if(work.sweepfor == nil) + work.sweepfor = runtime·parforalloc(MaxGcproc); + runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap.nspan, nil, true, sweepspan); if(work.nproc > 1) { runtime·noteclear(&work.alldone); runtime·helpgc(work.nproc); } - work.nwait = 0; - work.ndone = 0; - runtime·unlock(&work.markgate); // let the helpers in mark(scanblock); - if(DebugMark) + if(DebugMark) { mark(debug_scanblock); + runtime·atomicstore(&work.debugmarkdone, 1); + } t1 = runtime·nanotime(); - work.spanidx = 0; - runtime·unlock(&work.sweepgate); // let the helpers in - sweep(); - if(work.nproc > 1) - runtime·notesleep(&work.alldone); + runtime·parfordo(work.sweepfor); t2 = runtime·nanotime(); stealcache(); cachestats(&stats); + if(work.nproc > 1) + runtime·notesleep(&work.alldone); + + stats.nprocyield += work.sweepfor->nprocyield; + stats.nosyield += work.sweepfor->nosyield; + stats.nsleep += work.sweepfor->nsleep; + mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100; m->gcing = 0; @@ -1027,20 +1009,21 @@ runtime·gc(int32 force) if(gctrace) { runtime·printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB %D -> %D (%D-%D) objects," - " %D(%D) handoff, %D/%D/%D yields\n", + " %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n", mstats.numgc, work.nproc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000, heap0>>20, heap1>>20, obj0, obj1, mstats.nmalloc, mstats.nfree, stats.nhandoff, stats.nhandoffcnt, + work.sweepfor->nsteal, work.sweepfor->nstealcnt, stats.nprocyield, stats.nosyield, stats.nsleep); } - + runtime·MProf_GC(); runtime·semrelease(&runtime·worldsema); runtime·starttheworld(); - // give the queued finalizers, if any, a chance to run - if(finq != nil) + // give the queued finalizers, if any, a chance to run + if(finq != nil) runtime·gosched(); if(gctrace > 1 && !force)