From: Dmitriy Vyukov
Date: Thu, 10 Jan 2013 05:57:06 +0000 (+0400)
Subject: runtime: less aggressive per-thread stack segment caching
X-Git-Tag: go1.1rc2~1430
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=f82db7d9e4ccf04b19a087561ab0f521fc36e5b1;p=gostls13.git

runtime: less aggressive per-thread stack segment caching

Introduce global stack segment cache and limit per-thread cache size.
This greatly reduces StackSys memory on workloads that create lots of threads.

benchmark                        old ns/op    new ns/op    delta
BenchmarkStackGrowth                   665          656    -1.35%
BenchmarkStackGrowth-2                 333          328    -1.50%
BenchmarkStackGrowth-4                 224          172   -23.21%
BenchmarkStackGrowth-8                 124           91   -26.61%
BenchmarkStackGrowth-16                 82           47   -42.68%
BenchmarkStackGrowth-32                 73           40   -45.21%
BenchmarkStackGrowthDeep             97231        94391    -2.92%
BenchmarkStackGrowthDeep-2           47230        58562   +23.99%
BenchmarkStackGrowthDeep-4           24993        49356   +97.48%
BenchmarkStackGrowthDeep-8           15105        30072   +99.09%
BenchmarkStackGrowthDeep-16          10005        15623   +56.15%
BenchmarkStackGrowthDeep-32          12517        13069    +4.41%
TestStackMem#1,MB                      310           12   -96.13%
TestStackMem#2,MB                      296           14   -95.27%
TestStackMem#3,MB                      479           14   -97.08%
TestStackMem#1,sec                    3.22         2.26   -29.81%
TestStackMem#2,sec                    2.43         2.15   -11.52%
TestStackMem#3,sec                    2.50         2.38    -4.80%

R=sougou, no.smile.face, rsc
CC=golang-dev, msolomon
https://golang.org/cl/7029044
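In outline: every thread owns a small fixed-size ring of free stack segments and trades
them with a mutex-protected global list one batch at a time, so the global lock is taken
once per batch rather than on every segment alloc/free. Below is a minimal, standalone Go
sketch of that two-level scheme; the names (segmentCache, cacheSize, batchSize, segSize)
are illustrative stand-ins rather than the runtime's identifiers, and the sketch omits one
trick of the real C code, where the StackCacheNode header occupies the first segment of a
freshly allocated batch so that even the bookkeeping memory is handed out as a segment.

package main

import (
	"fmt"
	"sync"
)

const (
	cacheSize = 32   // per-thread ring capacity; stands in for StackCacheSize
	batchSize = 16   // global<->local transfer unit; stands in for StackCacheBatch
	segSize   = 4096 // stand-in for FixedStack
)

// Global cache of free segments, shared by all threads.
var (
	globalMu sync.Mutex
	global   [][]byte
)

// segmentCache is one thread's fixed-size ring of free segments.
// cacheSize divides 2^32, so plain uint32 wraparound keeps pos in range,
// the same trick the runtime plays with % StackCacheSize.
type segmentCache struct {
	buf      [cacheSize][]byte
	pos, cnt uint32
}

// alloc pops one segment, pulling a batch from the global cache when empty.
func (c *segmentCache) alloc() []byte {
	if c.cnt == 0 {
		c.refill()
	}
	c.pos = (c.pos - 1) % cacheSize
	c.cnt--
	return c.buf[c.pos]
}

// free pushes one segment, spilling a batch to the global cache when full.
func (c *segmentCache) free(seg []byte) {
	if c.cnt == cacheSize {
		c.release()
	}
	c.buf[c.pos] = seg
	c.pos = (c.pos + 1) % cacheSize
	c.cnt++
}

// refill moves batchSize segments global -> local, allocating fresh ones
// only when the global cache runs dry. The lock is taken once per batch,
// not once per alloc.
func (c *segmentCache) refill() {
	globalMu.Lock()
	for i := 0; i < batchSize; i++ {
		var seg []byte
		if n := len(global); n > 0 {
			seg, global = global[n-1], global[:n-1]
		} else {
			seg = make([]byte, segSize)
		}
		c.buf[c.pos] = seg
		c.pos = (c.pos + 1) % cacheSize
		c.cnt++
	}
	globalMu.Unlock()
}

// release moves batchSize segments local -> global, so no thread can hoard
// more than cacheSize segments.
func (c *segmentCache) release() {
	globalMu.Lock()
	for i := 0; i < batchSize; i++ {
		c.pos = (c.pos - 1) % cacheSize
		c.cnt--
		global = append(global, c.buf[c.pos])
	}
	globalMu.Unlock()
}

func main() {
	var c segmentCache
	s := c.alloc() // first alloc pulls a whole batch into the ring
	c.free(s)
	fmt.Println("local:", c.cnt, "global:", len(global))
}

The invariant batchSize < cacheSize keeps refill and release from ever triggering each
other: refill runs only with an empty ring and adds batchSize segments, release only with
a full ring and removes batchSize, mirroring StackCacheBatch = 16 against
StackCacheSize = 32 in the patch below.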
---

diff --git a/src/pkg/runtime/malloc.goc b/src/pkg/runtime/malloc.goc
index e37f8927ba..847f51df7c 100644
--- a/src/pkg/runtime/malloc.goc
+++ b/src/pkg/runtime/malloc.goc
@@ -748,9 +748,74 @@ runtime·cnew(Type *typ)
 	return ret;
 }
 
+typedef struct StackCacheNode StackCacheNode;
+struct StackCacheNode
+{
+	StackCacheNode *next;
+	void*	batch[StackCacheBatch-1];
+};
+
+static StackCacheNode *stackcache;
+static Lock stackcachemu;
+
+// stackcacherefill/stackcacherelease implement global cache of stack segments.
+// The cache is required to prevent unlimited growth of per-thread caches.
+static void
+stackcacherefill(void)
+{
+	StackCacheNode *n;
+	int32 i, pos;
+
+	runtime·lock(&stackcachemu);
+	n = stackcache;
+	if(n)
+		stackcache = n->next;
+	runtime·unlock(&stackcachemu);
+	if(n == nil) {
+		n = (StackCacheNode*)runtime·SysAlloc(FixedStack*StackCacheBatch);
+		if(n == nil)
+			runtime·throw("out of memory (stackcacherefill)");
+		runtime·xadd64(&mstats.stacks_sys, FixedStack*StackCacheBatch);
+		for(i = 0; i < StackCacheBatch-1; i++)
+			n->batch[i] = (byte*)n + (i+1)*FixedStack;
+	}
+	pos = m->stackcachepos;
+	for(i = 0; i < StackCacheBatch-1; i++) {
+		m->stackcache[pos] = n->batch[i];
+		pos = (pos + 1) % StackCacheSize;
+	}
+	m->stackcache[pos] = n;
+	pos = (pos + 1) % StackCacheSize;
+	m->stackcachepos = pos;
+	m->stackcachecnt += StackCacheBatch;
+}
+
+static void
+stackcacherelease(void)
+{
+	StackCacheNode *n;
+	uint32 i, pos;
+
+	pos = (m->stackcachepos - m->stackcachecnt) % StackCacheSize;
+	n = (StackCacheNode*)m->stackcache[pos];
+	pos = (pos + 1) % StackCacheSize;
+	for(i = 0; i < StackCacheBatch-1; i++) {
+		n->batch[i] = m->stackcache[pos];
+		pos = (pos + 1) % StackCacheSize;
+	}
+	m->stackcachecnt -= StackCacheBatch;
+	runtime·lock(&stackcachemu);
+	n->next = stackcache;
+	stackcache = n;
+	runtime·unlock(&stackcachemu);
+}
+
 void*
 runtime·stackalloc(uint32 n)
 {
+	uint32 pos;
+	void *v;
+
 	// Stackalloc must be called on scheduler stack, so that we
 	// never try to grow the stack during the code that stackalloc runs.
 	// Doing so would cause a deadlock (issue 1547).
@@ -769,7 +834,15 @@ runtime·stackalloc(uint32 n)
 			runtime·printf("stackalloc: in malloc, size=%d want %d", FixedStack, n);
 			runtime·throw("stackalloc");
 		}
-		return runtime·FixAlloc_Alloc(m->stackalloc);
+		if(m->stackcachecnt == 0)
+			stackcacherefill();
+		pos = m->stackcachepos;
+		pos = (pos - 1) % StackCacheSize;
+		v = m->stackcache[pos];
+		m->stackcachepos = pos;
+		m->stackcachecnt--;
+		m->stackinuse++;
+		return v;
 	}
 	return runtime·mallocgc(n, FlagNoProfiling|FlagNoGC, 0, 0);
 }
@@ -777,8 +850,16 @@ runtime·stackalloc(uint32 n)
 void
 runtime·stackfree(void *v, uintptr n)
 {
+	uint32 pos;
+
 	if(m->mallocing || m->gcing || n == FixedStack) {
-		runtime·FixAlloc_Free(m->stackalloc, v);
+		if(m->stackcachecnt == StackCacheSize)
+			stackcacherelease();
+		pos = m->stackcachepos;
+		m->stackcache[pos] = v;
+		m->stackcachepos = (pos + 1) % StackCacheSize;
+		m->stackcachecnt++;
+		m->stackinuse--;
 		return;
 	}
 	runtime·free(v);
diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index 86e879afe4..c7c12b49e8 100644
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -1176,18 +1176,15 @@ cachestats(GCStats *stats)
 	MCache *c;
 	int32 i;
 	uint64 stacks_inuse;
-	uint64 stacks_sys;
 	uint64 *src, *dst;
 
 	if(stats)
 		runtime·memclr((byte*)stats, sizeof(*stats));
 	stacks_inuse = 0;
-	stacks_sys = 0;
 	for(mp=runtime·allm; mp; mp=mp->alllink) {
 		c = mp->mcache;
 		runtime·purgecachedstats(c);
-		stacks_inuse += mp->stackalloc->inuse;
-		stacks_sys += mp->stackalloc->sys;
+		stacks_inuse += mp->stackinuse*FixedStack;
 		if(stats) {
 			src = (uint64*)&mp->gcstats;
 			dst = (uint64*)stats;
@@ -1203,7 +1200,6 @@ cachestats(GCStats *stats)
 		}
 	}
 	mstats.stacks_inuse = stacks_inuse;
-	mstats.stacks_sys = stacks_sys;
 }
 
 // Structure of arguments passed to function gc().
diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c
index 80e97795ab..eba0d6456b 100644
--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
@@ -345,8 +345,6 @@ mcommoninit(M *mp)
 {
 	mp->id = runtime·sched.mcount++;
 	mp->fastrand = 0x49f6428aUL + mp->id + runtime·cputicks();
-	mp->stackalloc = runtime·malloc(sizeof(*mp->stackalloc));
-	runtime·FixAlloc_Init(mp->stackalloc, FixedStack, runtime·SysAlloc, nil, nil);
 
 	if(mp->mcache == nil)
 		mp->mcache = runtime·allocmcache();
diff --git a/src/pkg/runtime/proc_test.go b/src/pkg/runtime/proc_test.go
index 1d51c5271e..0bbf9fa175 100644
--- a/src/pkg/runtime/proc_test.go
+++ b/src/pkg/runtime/proc_test.go
@@ -53,7 +53,7 @@ func stackGrowthRecursive(i int) {
 	}
 }
 
-func BenchmarkStackGrowth(b *testing.B) {
+func benchmarkStackGrowth(b *testing.B, rec int) {
 	const CallsPerSched = 1000
 	procs := runtime.GOMAXPROCS(-1)
 	N := int32(b.N / CallsPerSched)
@@ -63,7 +63,7 @@ func BenchmarkStackGrowth(b *testing.B) {
 		for atomic.AddInt32(&N, -1) >= 0 {
 			runtime.Gosched()
 			for g := 0; g < CallsPerSched; g++ {
-				stackGrowthRecursive(10)
+				stackGrowthRecursive(rec)
 			}
 		}
 		c <- true
@@ -74,6 +74,14 @@ func BenchmarkStackGrowth(b *testing.B) {
 	}
 }
 
+func BenchmarkStackGrowth(b *testing.B) {
+	benchmarkStackGrowth(b, 10)
+}
+
+func BenchmarkStackGrowthDeep(b *testing.B) {
+	benchmarkStackGrowth(b, 1024)
+}
+
 func BenchmarkSyscall(b *testing.B) {
 	const CallsPerSched = 1000
 	procs := runtime.GOMAXPROCS(-1)
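The -2/-4/... suffixes in the benchmark table above are GOMAXPROCS values supplied
through go test's -cpu flag. Assuming a checkout of this era (the runtime package still
lived under src/pkg), an invocation along these lines should reproduce the comparison;
-bench takes a regular expression matched anywhere in the name, so StackGrowth covers
the Deep variant too:

	go test -run=NONE -bench=StackGrowth -cpu=1,2,4,8,16,32 runtime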
diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h
index a228b06e32..47a7b6e78b 100644
--- a/src/pkg/runtime/runtime.h
+++ b/src/pkg/runtime/runtime.h
@@ -128,6 +128,13 @@ enum
 {
 	PtrSize = sizeof(void*),
 };
 
+enum
+{
+	// Per-M stack segment cache size.
+	StackCacheSize = 32,
+	// Global <-> per-M stack segment cache transfer batch size.
+	StackCacheBatch = 16,
+};
 
 /*
  * structures
@@ -262,7 +269,10 @@ struct M
 	M*	schedlink;
 	uint32	machport;	// Return address for Mach IPC (OS X)
 	MCache	*mcache;
-	FixAlloc	*stackalloc;
+	int32	stackinuse;
+	uint32	stackcachepos;
+	uint32	stackcachecnt;
+	void*	stackcache[StackCacheSize];
 	G*	lockedg;
 	G*	idleg;
 	uintptr	createstack[32];	// Stack that created this thread.
diff --git a/src/pkg/runtime/stack_test.go b/src/pkg/runtime/stack_test.go
index 8b75e4d121..f04bddc764 100644
--- a/src/pkg/runtime/stack_test.go
+++ b/src/pkg/runtime/stack_test.go
@@ -34,6 +34,7 @@ package runtime_test
 import (
 	. "runtime"
 	"testing"
+	"time"
 	"unsafe"
 )
 
@@ -1526,3 +1527,51 @@ func stack4988() (uintptr, uintptr) { var buf [4988]byte; use(buf[:]); return Stackguard() }
 func stack4992() (uintptr, uintptr) { var buf [4992]byte; use(buf[:]); return Stackguard() }
 func stack4996() (uintptr, uintptr) { var buf [4996]byte; use(buf[:]); return Stackguard() }
 func stack5000() (uintptr, uintptr) { var buf [5000]byte; use(buf[:]); return Stackguard() }
+
+// TestStackMem measures per-thread stack segment cache behavior.
+// The test consumed up to 500MB in the past.
+func TestStackMem(t *testing.T) {
+	const (
+		BatchSize      = 32
+		BatchCount     = 512
+		ArraySize      = 1024
+		RecursionDepth = 128
+	)
+	if testing.Short() {
+		return
+	}
+	defer GOMAXPROCS(GOMAXPROCS(BatchSize))
+	s0 := new(MemStats)
+	ReadMemStats(s0)
+	for b := 0; b < BatchCount; b++ {
+		c := make(chan bool, BatchSize)
+		for i := 0; i < BatchSize; i++ {
+			go func() {
+				var f func(k int, a [ArraySize]byte)
+				f = func(k int, a [ArraySize]byte) {
+					if k == 0 {
+						time.Sleep(time.Millisecond)
+						return
+					}
+					f(k-1, a)
+				}
+				f(RecursionDepth, [ArraySize]byte{})
+				c <- true
+			}()
+		}
+		for i := 0; i < BatchSize; i++ {
+			<-c
+		}
+	}
+	s1 := new(MemStats)
+	ReadMemStats(s1)
+	consumed := s1.StackSys - s0.StackSys
+	t.Logf("Consumed %vMB for stack mem", consumed>>20)
+	estimate := uint64(8 * BatchSize * ArraySize * RecursionDepth) // 8 is to reduce flakiness.
+	if consumed > estimate {
+		t.Fatalf("Stack mem: want %v, got %v", estimate, consumed)
+	}
+	if s1.StackInuse > 1<<20 {
+		t.Fatalf("Stack inuse: want %v, got %v", 1<<20, s1.StackInuse)
+	}
+}
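What TestStackMem asserts can also be watched from an ordinary program. The following
standalone sketch (not part of the change; the constants are arbitrary, and only
runtime.ReadMemStats and the MemStats.StackSys/StackInuse fields are real API) launches
bursts of deeply recursive goroutines across many threads and prints the stack statistics
after each batch; with the global cache in place, StackSys should level off after the
first burst instead of growing with every batch:

// stackmem.go: observe stack memory across bursts of deeply recursive goroutines.
package main

import (
	"fmt"
	"runtime"
	"sync"
)

// recurse forces stack growth by passing a large array by value.
func recurse(k int, a [1024]byte) {
	if k > 0 {
		recurse(k-1, a)
	}
}

func main() {
	runtime.GOMAXPROCS(32) // spread goroutines across many threads
	var ms runtime.MemStats
	for b := 0; b < 8; b++ {
		var wg sync.WaitGroup
		for i := 0; i < 32; i++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				recurse(128, [1024]byte{})
			}()
		}
		wg.Wait()
		runtime.ReadMemStats(&ms)
		fmt.Printf("batch %d: StackSys=%dKB StackInuse=%dKB\n",
			b, ms.StackSys>>10, ms.StackInuse>>10)
	}
}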