From: Dmitriy Vyukov <dvyukov@google.com>
Date: Tue, 22 May 2012 17:35:52 +0000 (-0400)
Subject: runtime: faster GC sweep phase
X-Git-Tag: go1.1rc2~3152
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=845aa1fc2c86d761a96d31de5e168d2a0f76f0da;p=gostls13.git

runtime: faster GC sweep phase
benchmark                              old ns/op    new ns/op    delta

garbage.BenchmarkParser               3731065750   3715543750   -0.41%
garbage.BenchmarkParser-2             3631299750   3495248500   -3.75%
garbage.BenchmarkParser-4             3386486000   3339353000   -1.39%
garbage.BenchmarkParser-8             3267632000   3286422500   +0.58%
garbage.BenchmarkParser-16            3299203000   3316081750   +0.51%

garbage.BenchmarkTree                  977532888    919453833   -5.94%
garbage.BenchmarkTree-2                919948555    853478000   -7.23%
garbage.BenchmarkTree-4                841329000    790207000   -6.08%
garbage.BenchmarkTree-8                787792777    740380666   -6.01%
garbage.BenchmarkTree-16               899257166    846594555   -5.86%

garbage.BenchmarkTree2                 574876300    571885800   -0.52%
garbage.BenchmarkTree2-2               348162700    345888900   -0.65%
garbage.BenchmarkTree2-4               184912500    179137000   -3.22%
garbage.BenchmarkTree2-8               104243900    103485600   -0.73%
garbage.BenchmarkTree2-16               97269500     85137100  -14.25%

garbage.BenchmarkParserPause           141101976    157746974  +11.80%
garbage.BenchmarkParserPause-2         103096051     83043048  -19.45%
garbage.BenchmarkParserPause-4          52153133     45951111  -11.89%
garbage.BenchmarkParserPause-8          36730190     38901024   +5.91%
garbage.BenchmarkParserPause-16         32678875     29578585   -9.49%

garbage.BenchmarkTreePause              29487065     29648439   +0.55%
garbage.BenchmarkTreePause-2            22443494     21306159   -5.07%
garbage.BenchmarkTreePause-4            15799691     14985647   -5.15%
garbage.BenchmarkTreePause-8            10768112      9531420  -12.97%
garbage.BenchmarkTreePause-16           16329891     15205158   -6.89%

garbage.BenchmarkTree2Pause           2586957240   2577533200   -0.36%
garbage.BenchmarkTree2Pause-2         1683383760   1673923800   -0.56%
garbage.BenchmarkTree2Pause-4         1102860320   1074040280   -2.68%
garbage.BenchmarkTree2Pause-8          902627920    886122400   -1.86%
garbage.BenchmarkTree2Pause-16         856470920    804152320   -6.50%

garbage.BenchmarkParserLastPause       277316000    280839000   +1.25%
garbage.BenchmarkParserLastPause-2     179446000    163687000   -8.78%
garbage.BenchmarkParserLastPause-4     106752000     94144000  -11.81%
garbage.BenchmarkParserLastPause-8      57758000     61640000   +6.72%
garbage.BenchmarkParserLastPause-16     51235000     42552000  -16.95%

garbage.BenchmarkTreeLastPause          45244000     50786000  +12.25%
garbage.BenchmarkTreeLastPause-2        37163000     34654000   -6.75%
garbage.BenchmarkTreeLastPause-4        24178000     21967000   -9.14%
garbage.BenchmarkTreeLastPause-8        20390000     15648000  -30.30%
garbage.BenchmarkTreeLastPause-16       22398000     20180000   -9.90%

garbage.BenchmarkTree2LastPause       5748706000   5718809000   -0.52%
garbage.BenchmarkTree2LastPause-2     3481570000   3458844000   -0.65%
garbage.BenchmarkTree2LastPause-4     1849073000   1791330000   -3.22%
garbage.BenchmarkTree2LastPause-8     1042375000   1034811000   -0.73%
garbage.BenchmarkTree2LastPause-16     972637000    851323000  -14.25%

There is also a visible improvement in consumed CPU time:
tree2 -heapsize=8000000000 -cpus=12
before: 248.74user 6.36system 0:52.74elapsed 483%CPU
after:  229.86user 6.33system 0:51.08elapsed 462%CPU
That is -1.66s of real time, but -18.91s of consumed CPU time (user+system).

R=golang-dev
CC=golang-dev
https://golang.org/cl/6215065
---

diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index 9ceeeea05d..4064f6916b 100644
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -120,10 +120,9 @@ static struct {
 	uint32	nproc;
 	volatile uint32	nwait;
 	volatile uint32	ndone;
+	volatile uint32 debugmarkdone;
 	Note	alldone;
-	Lock	markgate;
-	Lock	sweepgate;
-	uint32	spanidx;
+	ParFor	*sweepfor;
 
 	Lock;
 	byte	*chunk;
@@ -720,40 +719,10 @@ handlespecial(byte *p, uintptr size)
 	return true;
 }
 
-static void sweepspan(MSpan *s);
-
 // Sweep frees or collects finalizers for blocks not marked in the mark phase.
 // It clears the mark bits in preparation for the next GC round.
 static void
-sweep(void)
-{
-	MSpan *s, **allspans;
-	int64 now;
-	uint32 spanidx, nspan;
-
-	now = runtime·nanotime();
-	nspan = runtime·mheap.nspan;
-	allspans = runtime·mheap.allspans;
-	for(;;) {
-		spanidx = runtime·xadd(&work.spanidx, 1) - 1;
-		if(spanidx >= nspan)
-			break;
-		s = allspans[spanidx];
-
-		// Stamp newly unused spans. The scavenger will use that
-		// info to potentially give back some pages to the OS.
-		if(s->state == MSpanFree && s->unusedsince == 0)
-			s->unusedsince = now;
-
-		if(s->state != MSpanInUse)
-			continue;
-
-		sweepspan(s);
-	}
-}
-
-static void
-sweepspan(MSpan *s)
+sweepspan(ParFor *desc, uint32 idx)
 {
 	int32 cl, n, npages;
 	uintptr size;
@@ -762,7 +731,16 @@ sweepspan(MSpan *s)
 	byte *arena_start;
 	MLink *start, *end;
 	int32 nfree;
+	MSpan *s;
 
+	USED(&desc);
+	s = runtime·mheap.allspans[idx];
+	// Stamp newly unused spans. The scavenger will use that
+	// info to potentially give back some pages to the OS.
+	if(s->state == MSpanFree && s->unusedsince == 0)
+		s->unusedsince = runtime·nanotime();
+	if(s->state != MSpanInUse)
+		return;
 	arena_start = runtime·mheap.arena_start;
 	p = (byte*)(s->start << PageShift);
 	cl = s->sizeclass;
@@ -847,16 +825,15 @@ sweepspan(MSpan *s)
 void
 runtime·gchelper(void)
 {
-	// Wait until main proc is ready for mark help.
-	runtime·lock(&work.markgate);
-	runtime·unlock(&work.markgate);
 	scanblock(nil, 0);
 
-	// Wait until main proc is ready for sweep help.
-	runtime·lock(&work.sweepgate);
-	runtime·unlock(&work.sweepgate);
-	sweep();
+	if(DebugMark) {
+		// wait while the main thread executes mark(debug_scanblock)
+		while(runtime·atomicload(&work.debugmarkdone) == 0)
+			runtime·usleep(10);
+	}
 
+	runtime·parfordo(work.sweepfor);
 	if(runtime·xadd(&work.ndone, +1) == work.nproc-1)
 		runtime·notewakeup(&work.alldone);
 }
@@ -972,33 +949,38 @@ runtime·gc(int32 force)
 		obj0 = mstats.nmalloc - mstats.nfree;
 	}
 
-	runtime·lock(&work.markgate);
-	runtime·lock(&work.sweepgate);
-
+	work.nwait = 0;
+	work.ndone = 0;
+	work.debugmarkdone = 0;
 	work.nproc = runtime·gcprocs();
+	if(work.sweepfor == nil)
+		work.sweepfor = runtime·parforalloc(MaxGcproc);
+	runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap.nspan, nil, true, sweepspan);
 	if(work.nproc > 1) {
 		runtime·noteclear(&work.alldone);
 		runtime·helpgc(work.nproc);
 	}
-	work.nwait = 0;
-	work.ndone = 0;
 
-	runtime·unlock(&work.markgate);  // let the helpers in
 	mark(scanblock);
-	if(DebugMark)
+	if(DebugMark) {
 		mark(debug_scanblock);
+		runtime·atomicstore(&work.debugmarkdone, 1);
+	}
 	t1 = runtime·nanotime();
 
-	work.spanidx = 0;
-	runtime·unlock(&work.sweepgate);  // let the helpers in
-	sweep();
-	if(work.nproc > 1)
-		runtime·notesleep(&work.alldone);
+	runtime·parfordo(work.sweepfor);
 	t2 = runtime·nanotime();
 
 	stealcache();
 	cachestats(&stats);
 
+	if(work.nproc > 1)
+		runtime·notesleep(&work.alldone);
+
+	stats.nprocyield += work.sweepfor->nprocyield;
+	stats.nosyield += work.sweepfor->nosyield;
+	stats.nsleep += work.sweepfor->nsleep;
+
 	mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100;
 	m->gcing = 0;
 
@@ -1027,20 +1009,21 @@ runtime·gc(int32 force)
 
 	if(gctrace) {
 		runtime·printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB %D -> %D (%D-%D) objects,"
-				" %D(%D) handoff, %D/%D/%D yields\n",
+				" %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n",
 			mstats.numgc, work.nproc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000,
 			heap0>>20, heap1>>20, obj0, obj1,
 			mstats.nmalloc, mstats.nfree,
 			stats.nhandoff, stats.nhandoffcnt,
+			work.sweepfor->nsteal, work.sweepfor->nstealcnt,
 			stats.nprocyield, stats.nosyield, stats.nsleep);
 	}
-	
+
 	runtime·MProf_GC();
 	runtime·semrelease(&runtime·worldsema);
 	runtime·starttheworld();
 
-	// give the queued finalizers, if any, a chance to run	
-	if(finq != nil)	
+	// give the queued finalizers, if any, a chance to run
+	if(finq != nil)
 		runtime·gosched();
 
 	if(gctrace > 1 && !force)