From: Dmitriy Vyukov
Date: Thu, 10 Jan 2013 05:57:06 +0000 (+0400)
Subject: runtime: less aggressive per-thread stack segment caching
X-Git-Tag: go1.1rc2~1430
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=f82db7d9e4ccf04b19a087561ab0f521fc36e5b1;p=gostls13.git

runtime: less aggressive per-thread stack segment caching

Introduce global stack segment cache and limit per-thread cache size.
This greatly reduces StackSys memory on workloads that create lots of threads.

benchmark                        old ns/op    new ns/op    delta
BenchmarkStackGrowth                   665          656    -1.35%
BenchmarkStackGrowth-2                 333          328    -1.50%
BenchmarkStackGrowth-4                 224          172   -23.21%
BenchmarkStackGrowth-8                 124           91   -26.61%
BenchmarkStackGrowth-16                 82           47   -42.68%
BenchmarkStackGrowth-32                 73           40   -45.21%
BenchmarkStackGrowthDeep             97231        94391    -2.92%
BenchmarkStackGrowthDeep-2           47230        58562   +23.99%
BenchmarkStackGrowthDeep-4           24993        49356   +97.48%
BenchmarkStackGrowthDeep-8           15105        30072   +99.09%
BenchmarkStackGrowthDeep-16          10005        15623   +56.15%
BenchmarkStackGrowthDeep-32          12517        13069    +4.41%
TestStackMem#1,MB                      310           12   -96.13%
TestStackMem#2,MB                      296           14   -95.27%
TestStackMem#3,MB                      479           14   -97.08%
TestStackMem#1,sec                    3.22         2.26   -29.81%
TestStackMem#2,sec                    2.43         2.15   -11.52%
TestStackMem#3,sec                    2.50         2.38    -4.80%

R=sougou, no.smile.face, rsc
CC=golang-dev, msolomon
https://golang.org/cl/7029044
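In outline: every thread owns a small fixed-size ring of free stack segments and trades
them with a mutex-protected global list one batch at a time, so the global lock is taken
once per batch rather than on every segment alloc/free. Below is a minimal, standalone Go
sketch of that two-level scheme; the names (segmentCache, cacheSize, batchSize, segSize)
are illustrative stand-ins rather than the runtime's identifiers, and the sketch omits one
trick of the real C code, where the StackCacheNode header occupies the first segment of a
freshly allocated batch so that even the bookkeeping memory is handed out as a segment.

package main

import (
	"fmt"
	"sync"
)

const (
	cacheSize = 32   // per-thread ring capacity; stands in for StackCacheSize
	batchSize = 16   // global<->local transfer unit; stands in for StackCacheBatch
	segSize   = 4096 // stand-in for FixedStack
)

// Global cache of free segments, shared by all threads.
var (
	globalMu sync.Mutex
	global   [][]byte
)

// segmentCache is one thread's fixed-size ring of free segments.
// cacheSize divides 2^32, so plain uint32 wraparound keeps pos in range,
// the same trick the runtime plays with % StackCacheSize.
type segmentCache struct {
	buf      [cacheSize][]byte
	pos, cnt uint32
}

// alloc pops one segment, pulling a batch from the global cache when empty.
func (c *segmentCache) alloc() []byte {
	if c.cnt == 0 {
		c.refill()
	}
	c.pos = (c.pos - 1) % cacheSize
	c.cnt--
	return c.buf[c.pos]
}

// free pushes one segment, spilling a batch to the global cache when full.
func (c *segmentCache) free(seg []byte) {
	if c.cnt == cacheSize {
		c.release()
	}
	c.buf[c.pos] = seg
	c.pos = (c.pos + 1) % cacheSize
	c.cnt++
}

// refill moves batchSize segments global -> local, allocating fresh ones
// only when the global cache runs dry. The lock is taken once per batch,
// not once per alloc.
func (c *segmentCache) refill() {
	globalMu.Lock()
	for i := 0; i < batchSize; i++ {
		var seg []byte
		if n := len(global); n > 0 {
			seg, global = global[n-1], global[:n-1]
		} else {
			seg = make([]byte, segSize)
		}
		c.buf[c.pos] = seg
		c.pos = (c.pos + 1) % cacheSize
		c.cnt++
	}
	globalMu.Unlock()
}

// release moves batchSize segments local -> global, so no thread can hoard
// more than cacheSize segments.
func (c *segmentCache) release() {
	globalMu.Lock()
	for i := 0; i < batchSize; i++ {
		c.pos = (c.pos - 1) % cacheSize
		c.cnt--
		global = append(global, c.buf[c.pos])
	}
	globalMu.Unlock()
}

func main() {
	var c segmentCache
	s := c.alloc() // first alloc pulls a whole batch into the ring
	c.free(s)
	fmt.Println("local:", c.cnt, "global:", len(global))
}

The invariant batchSize < cacheSize keeps refill and release from ever triggering each
other: refill runs only with an empty ring and adds batchSize segments, release only with
a full ring and removes batchSize, mirroring StackCacheBatch = 16 against
StackCacheSize = 32 in the patch below.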
---

diff --git a/src/pkg/runtime/malloc.goc b/src/pkg/runtime/malloc.goc
index e37f8927ba..847f51df7c 100644
--- a/src/pkg/runtime/malloc.goc
+++ b/src/pkg/runtime/malloc.goc
@@ -748,9 +748,74 @@ runtime·cnew(Type *typ)
 	return ret;
 }
 
+typedef struct StackCacheNode StackCacheNode;
+struct StackCacheNode
+{
+	StackCacheNode *next;
+	void*	batch[StackCacheBatch-1];
+};
+
+static StackCacheNode *stackcache;
+static Lock stackcachemu;
+
+// stackcacherefill/stackcacherelease implement global cache of stack segments.
+// The cache is required to prevent unlimited growth of per-thread caches.
+static void
+stackcacherefill(void)
+{
+	StackCacheNode *n;
+	int32 i, pos;
+
+	runtime·lock(&stackcachemu);
+	n = stackcache;
+	if(n)
+		stackcache = n->next;
+	runtime·unlock(&stackcachemu);
+	if(n == nil) {
+		n = (StackCacheNode*)runtime·SysAlloc(FixedStack*StackCacheBatch);
+		if(n == nil)
+			runtime·throw("out of memory (stackcacherefill)");
+		runtime·xadd64(&mstats.stacks_sys, FixedStack*StackCacheBatch);
+		for(i = 0; i < StackCacheBatch-1; i++)
+			n->batch[i] = (byte*)n + (i+1)*FixedStack;
+	}
+	pos = m->stackcachepos;
+	for(i = 0; i < StackCacheBatch-1; i++) {
+		m->stackcache[pos] = n->batch[i];
+		pos = (pos + 1) % StackCacheSize;
+	}
+	m->stackcache[pos] = n;
+	pos = (pos + 1) % StackCacheSize;
+	m->stackcachepos = pos;
+	m->stackcachecnt += StackCacheBatch;
+}
+
+static void
+stackcacherelease(void)
+{
+	StackCacheNode *n;
+	uint32 i, pos;
+
+	pos = (m->stackcachepos - m->stackcachecnt) % StackCacheSize;
+	n = (StackCacheNode*)m->stackcache[pos];
+	pos = (pos + 1) % StackCacheSize;
+	for(i = 0; i < StackCacheBatch-1; i++) {
+		n->batch[i] = m->stackcache[pos];
+		pos = (pos + 1) % StackCacheSize;
+	}
+	m->stackcachecnt -= StackCacheBatch;
+	runtime·lock(&stackcachemu);
+	n->next = stackcache;
+	stackcache = n;
+	runtime·unlock(&stackcachemu);
+}
+
 void*
 runtime·stackalloc(uint32 n)
 {
+	uint32 pos;
+	void *v;
+
 	// Stackalloc must be called on scheduler stack, so that we
 	// never try to grow the stack during the code that stackalloc runs.
 	// Doing so would cause a deadlock (issue 1547).
@@ -769,7 +834,15 @@ runtime·stackalloc(uint32 n)
 			runtime·printf("stackalloc: in malloc, size=%d want %d", FixedStack, n);
 			runtime·throw("stackalloc");
 		}
-		return runtime·FixAlloc_Alloc(m->stackalloc);
+		if(m->stackcachecnt == 0)
+			stackcacherefill();
+		pos = m->stackcachepos;
+		pos = (pos - 1) % StackCacheSize;
+		v = m->stackcache[pos];
+		m->stackcachepos = pos;
+		m->stackcachecnt--;
+		m->stackinuse++;
+		return v;
 	}
 	return runtime·mallocgc(n, FlagNoProfiling|FlagNoGC, 0, 0);
 }
@@ -777,8 +850,16 @@ runtime·stackalloc(uint32 n)
 void
 runtime·stackfree(void *v, uintptr n)
 {
+	uint32 pos;
+
 	if(m->mallocing || m->gcing || n == FixedStack) {
-		runtime·FixAlloc_Free(m->stackalloc, v);
+		if(m->stackcachecnt == StackCacheSize)
+			stackcacherelease();
+		pos = m->stackcachepos;
+		m->stackcache[pos] = v;
+		m->stackcachepos = (pos + 1) % StackCacheSize;
+		m->stackcachecnt++;
+		m->stackinuse--;
 		return;
 	}
 	runtime·free(v);
diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index 86e879afe4..c7c12b49e8 100644
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -1176,18 +1176,15 @@ cachestats(GCStats *stats)
 	MCache *c;
 	int32 i;
 	uint64 stacks_inuse;
-	uint64 stacks_sys;
 	uint64 *src, *dst;
 
 	if(stats)
 		runtime·memclr((byte*)stats, sizeof(*stats));
 	stacks_inuse = 0;
-	stacks_sys = 0;
 	for(mp=runtime·allm; mp; mp=mp->alllink) {
 		c = mp->mcache;
 		runtime·purgecachedstats(c);
-		stacks_inuse += mp->stackalloc->inuse;
-		stacks_sys += mp->stackalloc->sys;
+		stacks_inuse += mp->stackinuse*FixedStack;
 		if(stats) {
 			src = (uint64*)&mp->gcstats;
 			dst = (uint64*)stats;
@@ -1203,7 +1200,6 @@ cachestats(GCStats *stats)
 		}
 	}
 	mstats.stacks_inuse = stacks_inuse;
-	mstats.stacks_sys = stacks_sys;
 }
 
 // Structure of arguments passed to function gc().
diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c
index 80e97795ab..eba0d6456b 100644
--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
@@ -345,8 +345,6 @@ mcommoninit(M *mp)
 {
 	mp->id = runtime·sched.mcount++;
 	mp->fastrand = 0x49f6428aUL + mp->id + runtime·cputicks();
-	mp->stackalloc = runtime·malloc(sizeof(*mp->stackalloc));
-	runtime·FixAlloc_Init(mp->stackalloc, FixedStack, runtime·SysAlloc, nil, nil);
 
 	if(mp->mcache == nil)
 		mp->mcache = runtime·allocmcache();
diff --git a/src/pkg/runtime/proc_test.go b/src/pkg/runtime/proc_test.go
index 1d51c5271e..0bbf9fa175 100644
--- a/src/pkg/runtime/proc_test.go
+++ b/src/pkg/runtime/proc_test.go
@@ -53,7 +53,7 @@ func stackGrowthRecursive(i int) {
 	}
 }
 
-func BenchmarkStackGrowth(b *testing.B) {
+func benchmarkStackGrowth(b *testing.B, rec int) {
 	const CallsPerSched = 1000
 	procs := runtime.GOMAXPROCS(-1)
 	N := int32(b.N / CallsPerSched)
@@ -63,7 +63,7 @@ func BenchmarkStackGrowth(b *testing.B) {
 		for atomic.AddInt32(&N, -1) >= 0 {
 			runtime.Gosched()
 			for g := 0; g < CallsPerSched; g++ {
-				stackGrowthRecursive(10)
+				stackGrowthRecursive(rec)
 			}
 		}
 		c <- true
@@ -74,6 +74,14 @@ func BenchmarkStackGrowth(b *testing.B) {
 	}
 }
 
+func BenchmarkStackGrowth(b *testing.B) {
+	benchmarkStackGrowth(b, 10)
+}
+
+func BenchmarkStackGrowthDeep(b *testing.B) {
+	benchmarkStackGrowth(b, 1024)
+}
+
 func BenchmarkSyscall(b *testing.B) {
 	const CallsPerSched = 1000
 	procs := runtime.GOMAXPROCS(-1)
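The -2/-4/... suffixes in the benchmark table above are GOMAXPROCS values supplied
through go test's -cpu flag. Assuming a checkout of this era (the runtime package still
lived under src/pkg), an invocation along these lines should reproduce the comparison;
-bench takes a regular expression matched anywhere in the name, so StackGrowth covers
the Deep variant too:

	go test -run=NONE -bench=StackGrowth -cpu=1,2,4,8,16,32 runtime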
diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h
index a228b06e32..47a7b6e78b 100644
--- a/src/pkg/runtime/runtime.h
+++ b/src/pkg/runtime/runtime.h
@@ -128,6 +128,13 @@ enum
 {
 	PtrSize = sizeof(void*),
 };
 
+enum
+{
+	// Per-M stack segment cache size.
+	StackCacheSize = 32,
+	// Global <-> per-M stack segment cache transfer batch size.
+	StackCacheBatch = 16,
+};
 
 /*
  * structures
@@ -262,7 +269,10 @@ struct M
 	M*	schedlink;
 	uint32	machport;	// Return address for Mach IPC (OS X)
 	MCache	*mcache;
-	FixAlloc	*stackalloc;
+	int32	stackinuse;
+	uint32	stackcachepos;
+	uint32	stackcachecnt;
+	void*	stackcache[StackCacheSize];
 	G*	lockedg;
 	G*	idleg;
 	uintptr	createstack[32];	// Stack that created this thread.
diff --git a/src/pkg/runtime/stack_test.go b/src/pkg/runtime/stack_test.go
index 8b75e4d121..f04bddc764 100644
--- a/src/pkg/runtime/stack_test.go
+++ b/src/pkg/runtime/stack_test.go
@@ -34,6 +34,7 @@ package runtime_test
 import (
 	. "runtime"
 	"testing"
+	"time"
 	"unsafe"
 )
 
@@ -1526,3 +1527,51 @@ func stack4988() (uintptr, uintptr) { var buf [4988]byte; use(buf[:]); return Stackguard() }
 func stack4992() (uintptr, uintptr) { var buf [4992]byte; use(buf[:]); return Stackguard() }
 func stack4996() (uintptr, uintptr) { var buf [4996]byte; use(buf[:]); return Stackguard() }
 func stack5000() (uintptr, uintptr) { var buf [5000]byte; use(buf[:]); return Stackguard() }
+
+// TestStackMem measures per-thread stack segment cache behavior.
+// The test consumed up to 500MB in the past.
+func TestStackMem(t *testing.T) {
+	const (
+		BatchSize      = 32
+		BatchCount     = 512
+		ArraySize      = 1024
+		RecursionDepth = 128
+	)
+	if testing.Short() {
+		return
+	}
+	defer GOMAXPROCS(GOMAXPROCS(BatchSize))
+	s0 := new(MemStats)
+	ReadMemStats(s0)
+	for b := 0; b < BatchCount; b++ {
+		c := make(chan bool, BatchSize)
+		for i := 0; i < BatchSize; i++ {
+			go func() {
+				var f func(k int, a [ArraySize]byte)
+				f = func(k int, a [ArraySize]byte) {
+					if k == 0 {
+						time.Sleep(time.Millisecond)
+						return
+					}
+					f(k-1, a)
+				}
+				f(RecursionDepth, [ArraySize]byte{})
+				c <- true
+			}()
+		}
+		for i := 0; i < BatchSize; i++ {
+			<-c
+		}
+	}
+	s1 := new(MemStats)
+	ReadMemStats(s1)
+	consumed := s1.StackSys - s0.StackSys
+	t.Logf("Consumed %vMB for stack mem", consumed>>20)
+	estimate := uint64(8 * BatchSize * ArraySize * RecursionDepth) // 8 is to reduce flakiness.
+	if consumed > estimate {
+		t.Fatalf("Stack mem: want %v, got %v", estimate, consumed)
+	}
+	if s1.StackInuse > 1<<20 {
+		t.Fatalf("Stack inuse: want %v, got %v", 1<<20, s1.StackInuse)
+	}
+}
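What TestStackMem asserts can also be watched from an ordinary program. The following
standalone sketch (not part of the change; the constants are arbitrary, and only
runtime.ReadMemStats and the MemStats.StackSys/StackInuse fields are real API) launches
bursts of deeply recursive goroutines across many threads and prints the stack statistics
after each batch; with the global cache in place, StackSys should level off after the
first burst instead of growing with every batch:

// stackmem.go: observe stack memory across bursts of deeply recursive goroutines.
package main

import (
	"fmt"
	"runtime"
	"sync"
)

// recurse forces stack growth by passing a large array by value.
func recurse(k int, a [1024]byte) {
	if k > 0 {
		recurse(k-1, a)
	}
}

func main() {
	runtime.GOMAXPROCS(32) // spread goroutines across many threads
	var ms runtime.MemStats
	for b := 0; b < 8; b++ {
		var wg sync.WaitGroup
		for i := 0; i < 32; i++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				recurse(128, [1024]byte{})
			}()
		}
		wg.Wait()
		runtime.ReadMemStats(&ms)
		fmt.Printf("batch %d: StackSys=%dKB StackInuse=%dKB\n",
			b, ms.StackSys>>10, ms.StackInuse>>10)
	}
}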