Introduce global stack segment cache and limit per-thread cache size.
This greatly reduces StackSys memory on workloads that create lots of threads.
benchmark                      old ns/op    new ns/op    delta
BenchmarkStackGrowth                 665          656    -1.35%
BenchmarkStackGrowth-2               333          328    -1.50%
BenchmarkStackGrowth-4               224          172   -23.21%
BenchmarkStackGrowth-8               124           91   -26.13%
BenchmarkStackGrowth-16               82           47   -41.94%
BenchmarkStackGrowth-32               73           40   -44.79%
BenchmarkStackGrowthDeep           97231        94391    -2.92%
BenchmarkStackGrowthDeep-2         47230        58562   +23.99%
BenchmarkStackGrowthDeep-4         24993        49356   +97.48%
BenchmarkStackGrowthDeep-8         15105        30072   +99.09%
BenchmarkStackGrowthDeep-16        10005        15623   +56.15%
BenchmarkStackGrowthDeep-32        12517        13069    +4.41%

benchmark                            old          new    delta
TestStackMem#1,MB                    310           12   -96.13%
TestStackMem#2,MB                    296           14   -95.27%
TestStackMem#3,MB                    479           14   -97.08%
TestStackMem#1,sec                  3.22         2.26   -29.81%
TestStackMem#2,sec                  2.43         2.15   -11.52%
TestStackMem#3,sec                  2.50         2.38    -4.80%
R=sougou, no.smile.face, rsc
CC=golang-dev, msolomon
https://golang.org/cl/7029044
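
For reference, the TestStackMem MB figures above come from runtime.MemStats.StackSys, sampled before and after running batches of goroutines with deep stacks (see TestStackMem below). A minimal standalone sketch of that measurement, not part of this CL; growStack and the batch sizes are illustrative assumptions:

package main

import (
	"fmt"
	"runtime"
	"sync"
)

// growStack forces several stack segment allocations by recursing
// with a large argument.
func growStack(k int, a [1024]byte) {
	if k > 0 {
		growStack(k-1, a)
	}
}

func main() {
	var before, after runtime.MemStats
	runtime.ReadMemStats(&before)

	var wg sync.WaitGroup
	for i := 0; i < 32; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			growStack(128, [1024]byte{})
		}()
	}
	wg.Wait()

	runtime.ReadMemStats(&after)
	fmt.Printf("StackSys delta: %d bytes\n", after.StackSys-before.StackSys)
	fmt.Printf("StackInuse: %d bytes\n", after.StackInuse)
}
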
return ret;
}
+typedef struct StackCacheNode StackCacheNode;
+struct StackCacheNode
+{
+	StackCacheNode *next;
+	void* batch[StackCacheBatch-1];
+};
+
+static StackCacheNode *stackcache;
+static Lock stackcachemu;
+
+// stackcacherefill/stackcacherelease implement a global cache of stack segments.
+// The cache is required to prevent unlimited growth of per-thread caches.
+static void
+stackcacherefill(void)
+{
+	StackCacheNode *n;
+	int32 i, pos;
+
+	runtime·lock(&stackcachemu);
+	n = stackcache;
+	if(n)
+		stackcache = n->next;
+	runtime·unlock(&stackcachemu);
+	if(n == nil) {
+		n = (StackCacheNode*)runtime·SysAlloc(FixedStack*StackCacheBatch);
+		if(n == nil)
+ runtime·throw("out of memory (staccachekrefill)");
+		runtime·xadd64(&mstats.stacks_sys, FixedStack*StackCacheBatch);
+		for(i = 0; i < StackCacheBatch-1; i++)
+			n->batch[i] = (byte*)n + (i+1)*FixedStack;
+	}
+	pos = m->stackcachepos;
+	for(i = 0; i < StackCacheBatch-1; i++) {
+		m->stackcache[pos] = n->batch[i];
+		pos = (pos + 1) % StackCacheSize;
+	}
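+	// The node's own memory is itself a usable stack segment;
+	// cache it as the last entry of the batch.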
+	m->stackcache[pos] = n;
+	pos = (pos + 1) % StackCacheSize;
+	m->stackcachepos = pos;
+	m->stackcachecnt += StackCacheBatch;
+}
+
+static void
+stackcacherelease(void)
+{
+	StackCacheNode *n;
+	uint32 i, pos;
+
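+	// pos indexes the oldest cached segment; its memory doubles as
+	// the list node that carries the batch back to the global cache.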
+	pos = (m->stackcachepos - m->stackcachecnt) % StackCacheSize;
+	n = (StackCacheNode*)m->stackcache[pos];
+	pos = (pos + 1) % StackCacheSize;
+	for(i = 0; i < StackCacheBatch-1; i++) {
+		n->batch[i] = m->stackcache[pos];
+		pos = (pos + 1) % StackCacheSize;
+	}
+	m->stackcachecnt -= StackCacheBatch;
+	runtime·lock(&stackcachemu);
+	n->next = stackcache;
+	stackcache = n;
+	runtime·unlock(&stackcachemu);
+}
+
void*
runtime·stackalloc(uint32 n)
{
+	uint32 pos;
+	void *v;
+
// Stackalloc must be called on scheduler stack, so that we
// never try to grow the stack during the code that stackalloc runs.
// Doing so would cause a deadlock (issue 1547).
runtime·printf("stackalloc: in malloc, size=%d want %d", FixedStack, n);
runtime·throw("stackalloc");
}
- return runtime·FixAlloc_Alloc(m->stackalloc);
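+		// Pop the most recently freed segment from the per-M cache,
+		// refilling a batch from the global cache when it is empty.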
+		if(m->stackcachecnt == 0)
+			stackcacherefill();
+		pos = m->stackcachepos;
+		pos = (pos - 1) % StackCacheSize;
+		v = m->stackcache[pos];
+		m->stackcachepos = pos;
+		m->stackcachecnt--;
+		m->stackinuse++;
+		return v;
}
return runtime·mallocgc(n, FlagNoProfiling|FlagNoGC, 0, 0);
}
void
runtime·stackfree(void *v, uintptr n)
{
+	uint32 pos;
+
if(m->mallocing || m->gcing || n == FixedStack) {
- runtime·FixAlloc_Free(m->stackalloc, v);
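+		// Push the segment onto the per-M cache; when the cache is
+		// full, move a batch back to the global cache.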
+		if(m->stackcachecnt == StackCacheSize)
+			stackcacherelease();
+		pos = m->stackcachepos;
+		m->stackcache[pos] = v;
+		m->stackcachepos = (pos + 1) % StackCacheSize;
+		m->stackcachecnt++;
+		m->stackinuse--;
return;
}
runtime·free(v);
MCache *c;
int32 i;
uint64 stacks_inuse;
- uint64 stacks_sys;
uint64 *src, *dst;
if(stats)
runtime·memclr((byte*)stats, sizeof(*stats));
stacks_inuse = 0;
- stacks_sys = 0;
for(mp=runtime·allm; mp; mp=mp->alllink) {
c = mp->mcache;
runtime·purgecachedstats(c);
- stacks_inuse += mp->stackalloc->inuse;
- stacks_sys += mp->stackalloc->sys;
+ stacks_inuse += mp->stackinuse*FixedStack;
if(stats) {
src = (uint64*)&mp->gcstats;
dst = (uint64*)stats;
}
}
mstats.stacks_inuse = stacks_inuse;
- mstats.stacks_sys = stacks_sys;
}
// Structure of arguments passed to function gc().
{
mp->id = runtime·sched.mcount++;
mp->fastrand = 0x49f6428aUL + mp->id + runtime·cputicks();
- mp->stackalloc = runtime·malloc(sizeof(*mp->stackalloc));
- runtime·FixAlloc_Init(mp->stackalloc, FixedStack, runtime·SysAlloc, nil, nil);
if(mp->mcache == nil)
mp->mcache = runtime·allocmcache();
}
}
-func BenchmarkStackGrowth(b *testing.B) {
+func benchmarkStackGrowth(b *testing.B, rec int) {
const CallsPerSched = 1000
procs := runtime.GOMAXPROCS(-1)
N := int32(b.N / CallsPerSched)
for atomic.AddInt32(&N, -1) >= 0 {
runtime.Gosched()
for g := 0; g < CallsPerSched; g++ {
- stackGrowthRecursive(10)
+ stackGrowthRecursive(rec)
}
}
c <- true
}
}
+func BenchmarkStackGrowth(b *testing.B) {
+	benchmarkStackGrowth(b, 10)
+}
+
+func BenchmarkStackGrowthDeep(b *testing.B) {
+	benchmarkStackGrowth(b, 1024)
+}
+
func BenchmarkSyscall(b *testing.B) {
const CallsPerSched = 1000
procs := runtime.GOMAXPROCS(-1)
{
PtrSize = sizeof(void*),
};
+enum
+{
+	// Per-M stack segment cache size.
+	StackCacheSize = 32,
+	// Global <-> per-M stack segment cache transfer batch size.
+	StackCacheBatch = 16,
+};
/*
* structures
M* schedlink;
uint32 machport; // Return address for Mach IPC (OS X)
MCache *mcache;
- FixAlloc *stackalloc;
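+	// Per-M cache of FixedStack-sized stack segments,
+	// maintained as a ring buffer by stackcachepos/stackcachecnt.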
+	int32 stackinuse;
+	uint32 stackcachepos;
+	uint32 stackcachecnt;
+	void* stackcache[StackCacheSize];
G* lockedg;
G* idleg;
uintptr createstack[32]; // Stack that created this thread.
import (
. "runtime"
"testing"
+ "time"
"unsafe"
)
func stack4992() (uintptr, uintptr) { var buf [4992]byte; use(buf[:]); return Stackguard() }
func stack4996() (uintptr, uintptr) { var buf [4996]byte; use(buf[:]); return Stackguard() }
func stack5000() (uintptr, uintptr) { var buf [5000]byte; use(buf[:]); return Stackguard() }
+
+// TestStackMem exercises the per-thread stack segment cache by running
+// batches of goroutines with deep stacks. Without the global cache the
+// test consumed up to 500MB of stack memory.
+func TestStackMem(t *testing.T) {
+	const (
+		BatchSize      = 32
+		BatchCount     = 512
+		ArraySize      = 1024
+		RecursionDepth = 128
+	)
+	if testing.Short() {
+		return
+	}
+	defer GOMAXPROCS(GOMAXPROCS(BatchSize))
+	s0 := new(MemStats)
+	ReadMemStats(s0)
+	for b := 0; b < BatchCount; b++ {
+		c := make(chan bool, BatchSize)
+		for i := 0; i < BatchSize; i++ {
+			go func() {
+				var f func(k int, a [ArraySize]byte)
+				f = func(k int, a [ArraySize]byte) {
+					if k == 0 {
+						time.Sleep(time.Millisecond)
+						return
+					}
+					f(k-1, a)
+				}
+				f(RecursionDepth, [ArraySize]byte{})
+				c <- true
+			}()
+		}
+		for i := 0; i < BatchSize; i++ {
+			<-c
+		}
+	}
+	s1 := new(MemStats)
+	ReadMemStats(s1)
+	consumed := s1.StackSys - s0.StackSys
+	t.Logf("Consumed %vMB for stack mem", consumed>>20)
+	estimate := uint64(8 * BatchSize * ArraySize * RecursionDepth) // 8 is to reduce flakiness.
+	if consumed > estimate {
+		t.Fatalf("Stack mem: want %v, got %v", estimate, consumed)
+	}
+	if s1.StackInuse > 1<<20 {
+		t.Fatalf("Stack inuse: want %v, got %v", 1<<20, s1.StackInuse)
+	}
+}