]> Cypherpunks repositories - gostls13.git/commitdiff
runtime: faster GC sweep phase
authorDmitriy Vyukov <dvyukov@google.com>
Tue, 22 May 2012 17:35:52 +0000 (13:35 -0400)
committerRuss Cox <rsc@golang.org>
Tue, 22 May 2012 17:35:52 +0000 (13:35 -0400)
benchmark                              old ns/op    new ns/op    delta

garbage.BenchmarkParser               3731065750   3715543750   -0.41%
garbage.BenchmarkParser-2             3631299750   3495248500   -3.75%
garbage.BenchmarkParser-4             3386486000   3339353000   -1.39%
garbage.BenchmarkParser-8             3267632000   3286422500   +0.58%
garbage.BenchmarkParser-16            3299203000   3316081750   +0.51%

garbage.BenchmarkTree                  977532888    919453833   -5.94%
garbage.BenchmarkTree-2                919948555    853478000   -7.23%
garbage.BenchmarkTree-4                841329000    790207000   -6.08%
garbage.BenchmarkTree-8                787792777    740380666   -6.01%
garbage.BenchmarkTree-16               899257166    846594555   -5.86%

garbage.BenchmarkTree2                 574876300    571885800   -0.52%
garbage.BenchmarkTree2-2               348162700    345888900   -0.65%
garbage.BenchmarkTree2-4               184912500    179137000   -3.22%
garbage.BenchmarkTree2-8               104243900    103485600   -0.73%
garbage.BenchmarkTree2-16               97269500     85137100  -14.25%

garbage.BenchmarkParserPause           141101976    157746974  +11.80%
garbage.BenchmarkParserPause-2         103096051     83043048  -19.45%
garbage.BenchmarkParserPause-4          52153133     45951111  -11.89%
garbage.BenchmarkParserPause-8          36730190     38901024   +5.91%
garbage.BenchmarkParserPause-16         32678875     29578585   -9.49%

garbage.BenchmarkTreePause              29487065     29648439   +0.55%
garbage.BenchmarkTreePause-2            22443494     21306159   -5.07%
garbage.BenchmarkTreePause-4            15799691     14985647   -5.15%
garbage.BenchmarkTreePause-8            10768112     9531420   -12.97%
garbage.BenchmarkTreePause-16           16329891     15205158   -6.89%

garbage.BenchmarkTree2Pause           2586957240   2577533200   -0.36%
garbage.BenchmarkTree2Pause-2         1683383760   1673923800   -0.56%
garbage.BenchmarkTree2Pause-4         1102860320   1074040280   -2.68%
garbage.BenchmarkTree2Pause-8          902627920    886122400   -1.86%
garbage.BenchmarkTree2Pause-16         856470920    804152320   -6.50%

garbage.BenchmarkParserLastPause       277316000    280839000   +1.25%
garbage.BenchmarkParserLastPause-2     179446000    163687000   -8.78%
garbage.BenchmarkParserLastPause-4     106752000     94144000  -11.81%
garbage.BenchmarkParserLastPause-8      57758000     61640000   +6.72%
garbage.BenchmarkParserLastPause-16     51235000     42552000  -16.95%

garbage.BenchmarkTreeLastPause          45244000     50786000  +12.25%
garbage.BenchmarkTreeLastPause-2        37163000     34654000   -6.75%
garbage.BenchmarkTreeLastPause-4        24178000     21967000   -9.14%
garbage.BenchmarkTreeLastPause-8        20390000     15648000  -30.30%
garbage.BenchmarkTreeLastPause-16       22398000     20180000   -9.90%

garbage.BenchmarkTree2LastPause       5748706000   5718809000   -0.52%
garbage.BenchmarkTree2LastPause-2     3481570000   3458844000   -0.65%
garbage.BenchmarkTree2LastPause-4     1849073000   1791330000   -3.22%
garbage.BenchmarkTree2LastPause-8     1042375000   1034811000   -0.73%
garbage.BenchmarkTree2LastPause-16     972637000    851323000  -14.25%

There is also visible improvement in consumed CPU time:
tree2 -heapsize=8000000000 -cpus=12
before: 248.74user 6.36system 0:52.74elapsed 483%CPU
after:  229.86user 6.33system 0:51.08elapsed 462%CPU
-1.66s of real time, but -18.91s of consumed CPU time

R=golang-dev
CC=golang-dev
https://golang.org/cl/6215065

src/pkg/runtime/mgc0.c

index 9ceeeea05d70ecdfcff16caeeeededb20a634155..4064f6916b48982ffa88fcba3c650d41a229d76b 100644 (file)
@@ -120,10 +120,9 @@ static struct {
        uint32  nproc;
        volatile uint32 nwait;
        volatile uint32 ndone;
+       volatile uint32 debugmarkdone;
        Note    alldone;
-       Lock    markgate;
-       Lock    sweepgate;
-       uint32  spanidx;
+       ParFor  *sweepfor;
 
        Lock;
        byte    *chunk;
@@ -720,40 +719,10 @@ handlespecial(byte *p, uintptr size)
        return true;
 }
 
-static void sweepspan(MSpan *s);
-
 // Sweep frees or collects finalizers for blocks not marked in the mark phase.
 // It clears the mark bits in preparation for the next GC round.
 static void
-sweep(void)
-{
-       MSpan *s, **allspans;
-       int64 now;
-       uint32 spanidx, nspan;
-
-       now = runtime·nanotime();
-       nspan = runtime·mheap.nspan;
-       allspans = runtime·mheap.allspans;
-       for(;;) {
-               spanidx = runtime·xadd(&work.spanidx, 1) - 1;
-               if(spanidx >= nspan)
-                       break;
-               s = allspans[spanidx];
-
-               // Stamp newly unused spans. The scavenger will use that
-               // info to potentially give back some pages to the OS.
-               if(s->state == MSpanFree && s->unusedsince == 0)
-                       s->unusedsince = now;
-
-               if(s->state != MSpanInUse)
-                       continue;
-
-               sweepspan(s);
-       }
-}
-
-static void
-sweepspan(MSpan *s)
+sweepspan(ParFor *desc, uint32 idx)
 {
        int32 cl, n, npages;
        uintptr size;
@@ -762,7 +731,16 @@ sweepspan(MSpan *s)
        byte *arena_start;
        MLink *start, *end;
        int32 nfree;
+       MSpan *s;
 
+       USED(&desc);
+       s = runtime·mheap.allspans[idx];
+       // Stamp newly unused spans. The scavenger will use that
+       // info to potentially give back some pages to the OS.
+       if(s->state == MSpanFree && s->unusedsince == 0)
+               s->unusedsince = runtime·nanotime();
+       if(s->state != MSpanInUse)
+               return;
        arena_start = runtime·mheap.arena_start;
        p = (byte*)(s->start << PageShift);
        cl = s->sizeclass;
@@ -847,16 +825,15 @@ sweepspan(MSpan *s)
 void
 runtime·gchelper(void)
 {
-       // Wait until main proc is ready for mark help.
-       runtime·lock(&work.markgate);
-       runtime·unlock(&work.markgate);
        scanblock(nil, 0);
 
-       // Wait until main proc is ready for sweep help.
-       runtime·lock(&work.sweepgate);
-       runtime·unlock(&work.sweepgate);
-       sweep();
+       if(DebugMark) {
+               // wait while the main thread executes mark(debug_scanblock)
+               while(runtime·atomicload(&work.debugmarkdone) == 0)
+                       runtime·usleep(10);
+       }
 
+       runtime·parfordo(work.sweepfor);
        if(runtime·xadd(&work.ndone, +1) == work.nproc-1)
                runtime·notewakeup(&work.alldone);
 }
@@ -972,33 +949,38 @@ runtime·gc(int32 force)
                obj0 = mstats.nmalloc - mstats.nfree;
        }
 
-       runtime·lock(&work.markgate);
-       runtime·lock(&work.sweepgate);
-
+       work.nwait = 0;
+       work.ndone = 0;
+       work.debugmarkdone = 0;
        work.nproc = runtime·gcprocs();
+       if(work.sweepfor == nil)
+               work.sweepfor = runtime·parforalloc(MaxGcproc);
+       runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap.nspan, nil, true, sweepspan);
        if(work.nproc > 1) {
                runtime·noteclear(&work.alldone);
                runtime·helpgc(work.nproc);
        }
-       work.nwait = 0;
-       work.ndone = 0;
 
-       runtime·unlock(&work.markgate);  // let the helpers in
        mark(scanblock);
-       if(DebugMark)
+       if(DebugMark) {
                mark(debug_scanblock);
+               runtime·atomicstore(&work.debugmarkdone, 1);
+       }
        t1 = runtime·nanotime();
 
-       work.spanidx = 0;
-       runtime·unlock(&work.sweepgate);  // let the helpers in
-       sweep();
-       if(work.nproc > 1)
-               runtime·notesleep(&work.alldone);
+       runtime·parfordo(work.sweepfor);
        t2 = runtime·nanotime();
 
        stealcache();
        cachestats(&stats);
 
+       if(work.nproc > 1)
+               runtime·notesleep(&work.alldone);
+
+       stats.nprocyield += work.sweepfor->nprocyield;
+       stats.nosyield += work.sweepfor->nosyield;
+       stats.nsleep += work.sweepfor->nsleep;
+
        mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100;
        m->gcing = 0;
 
@@ -1027,20 +1009,21 @@ runtime·gc(int32 force)
 
        if(gctrace) {
                runtime·printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB %D -> %D (%D-%D) objects,"
-                               " %D(%D) handoff, %D/%D/%D yields\n",
+                               " %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n",
                        mstats.numgc, work.nproc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000,
                        heap0>>20, heap1>>20, obj0, obj1,
                        mstats.nmalloc, mstats.nfree,
                        stats.nhandoff, stats.nhandoffcnt,
+                       work.sweepfor->nsteal, work.sweepfor->nstealcnt,
                        stats.nprocyield, stats.nosyield, stats.nsleep);
        }
-       
+
        runtime·MProf_GC();
        runtime·semrelease(&runtime·worldsema);
        runtime·starttheworld();
 
-       // give the queued finalizers, if any, a chance to run  
-       if(finq != nil) 
+       // give the queued finalizers, if any, a chance to run
+       if(finq != nil)
                runtime·gosched();
 
        if(gctrace > 1 && !force)