gostls13.git/commitdiff
runtime: less aggressive per-thread stack segment caching
author Dmitriy Vyukov <dvyukov@google.com>
Thu, 10 Jan 2013 05:57:06 +0000 (09:57 +0400)
committer Dmitriy Vyukov <dvyukov@google.com>
Thu, 10 Jan 2013 05:57:06 +0000 (09:57 +0400)
Introduce global stack segment cache and limit per-thread cache size.
This greatly reduces StackSys memory on workloads that create lots of threads.
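
The idea, in miniature: each thread (M) keeps a small, fixed-size cache of stack
segments and exchanges segments with a mutex-protected global free list in
fixed-size batches, so an idle thread can never pin more than a bounded amount
of stack memory. The Go program below is only an illustrative model of that
scheme; the names (segment, worker, globalPool, cacheSize, batchSize) are
invented for this sketch, and the real change is C code in
src/pkg/runtime/malloc.goc (stackalloc, stackfree, stackcacherefill and
stackcacherelease in the diff below) operating on raw FixedStack-sized blocks.

    // Illustrative model only; not the runtime implementation.
    package main

    import (
        "fmt"
        "sync"
    )

    const (
        cacheSize = 32 // per-thread cache capacity (cf. StackCacheSize)
        batchSize = 16 // global <-> local transfer unit (cf. StackCacheBatch)
    )

    type segment [4096]byte // stand-in for one fixed-size stack segment

    // globalPool holds batches of free segments shared by all workers.
    var globalPool struct {
        mu   sync.Mutex
        free [][]*segment
    }

    // worker models one thread and its bounded local cache.
    type worker struct {
        cache []*segment
    }

    func (w *worker) alloc() *segment {
        if len(w.cache) == 0 {
            w.refill()
        }
        s := w.cache[len(w.cache)-1]
        w.cache = w.cache[:len(w.cache)-1]
        return s
    }

    func (w *worker) free(s *segment) {
        if len(w.cache) == cacheSize {
            w.release()
        }
        w.cache = append(w.cache, s)
    }

    // refill moves one batch from the global pool into the local cache,
    // allocating fresh segments only when the pool is empty.
    func (w *worker) refill() {
        globalPool.mu.Lock()
        var b []*segment
        if n := len(globalPool.free); n > 0 {
            b = globalPool.free[n-1]
            globalPool.free = globalPool.free[:n-1]
        }
        globalPool.mu.Unlock()
        if b == nil {
            for i := 0; i < batchSize; i++ {
                b = append(b, new(segment))
            }
        }
        w.cache = append(w.cache, b...)
    }

    // release returns one batch to the global pool, bounding local memory.
    func (w *worker) release() {
        b := append([]*segment(nil), w.cache[len(w.cache)-batchSize:]...)
        w.cache = w.cache[:len(w.cache)-batchSize]
        globalPool.mu.Lock()
        globalPool.free = append(globalPool.free, b)
        globalPool.mu.Unlock()
    }

    func main() {
        var w worker
        s := w.alloc() // the first alloc pulls a whole batch into the local cache
        w.free(s)
        fmt.Println("locally cached segments:", len(w.cache)) // 16
    }

In the runtime the local cache is a fixed array on the M walked with a ring
index rather than a Go slice, but the shape, a bounded per-thread cache backed
by a batched, lock-protected global pool, is the same.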

benchmark                      old ns/op    new ns/op    delta
BenchmarkStackGrowth                 665          656   -1.35%
BenchmarkStackGrowth-2               333          328   -1.50%
BenchmarkStackGrowth-4               224          172  -23.21%
BenchmarkStackGrowth-8               124           91  -26.13%
BenchmarkStackGrowth-16               82           47  -41.94%
BenchmarkStackGrowth-32               73           40  -44.79%

BenchmarkStackGrowthDeep           97231        94391   -2.92%
BenchmarkStackGrowthDeep-2         47230        58562  +23.99%
BenchmarkStackGrowthDeep-4         24993        49356  +97.48%
BenchmarkStackGrowthDeep-8         15105        30072  +99.09%
BenchmarkStackGrowthDeep-16        10005        15623  +56.15%
BenchmarkStackGrowthDeep-32        12517        13069   +4.41%

TestStackMem#1,MB                  310          12       -96.13%
TestStackMem#2,MB                  296          14       -95.27%
TestStackMem#3,MB                  479          14       -97.08%

TestStackMem#1,sec                 3.22         2.26     -29.81%
TestStackMem#2,sec                 2.43         2.15     -11.52%
TestStackMem#3,sec                 2.50         2.38      -4.80%

R=sougou, no.smile.face, rsc
CC=golang-dev, msolomon
https://golang.org/cl/7029044

src/pkg/runtime/malloc.goc
src/pkg/runtime/mgc0.c
src/pkg/runtime/proc.c
src/pkg/runtime/proc_test.go
src/pkg/runtime/runtime.h
src/pkg/runtime/stack_test.go

diff --git a/src/pkg/runtime/malloc.goc b/src/pkg/runtime/malloc.goc
index e37f8927ba8ff7c5dcc84ec2ae7fa4447c522117..847f51df7cfb468edbf830a1cdec6debc75c0d42 100644
@@ -748,9 +748,74 @@ runtime·cnew(Type *typ)
        return ret;
 }
 
+typedef struct StackCacheNode StackCacheNode;
+struct StackCacheNode
+{
+       StackCacheNode *next;
+       void*   batch[StackCacheBatch-1];
+};
+
+static StackCacheNode *stackcache;
+static Lock stackcachemu;
+
+// stackcacherefill/stackcacherelease implement global cache of stack segments.
+// The cache is required to prevent unlimited growth of per-thread caches.
+static void
+stackcacherefill(void)
+{
+       StackCacheNode *n;
+       int32 i, pos;
+
+       runtime·lock(&stackcachemu);
+       n = stackcache;
+       if(n)
+               stackcache = n->next;
+       runtime·unlock(&stackcachemu);
+       if(n == nil) {
+               n = (StackCacheNode*)runtime·SysAlloc(FixedStack*StackCacheBatch);
+               if(n == nil)
+                       runtime·throw("out of memory (stackcacherefill)");
+               runtime·xadd64(&mstats.stacks_sys, FixedStack*StackCacheBatch);
+               for(i = 0; i < StackCacheBatch-1; i++)
+                       n->batch[i] = (byte*)n + (i+1)*FixedStack;
+       }
+       pos = m->stackcachepos;
+       for(i = 0; i < StackCacheBatch-1; i++) {
+               m->stackcache[pos] = n->batch[i];
+               pos = (pos + 1) % StackCacheSize;
+       }
+       m->stackcache[pos] = n;
+       pos = (pos + 1) % StackCacheSize;
+       m->stackcachepos = pos;
+       m->stackcachecnt += StackCacheBatch;
+}
+
+static void
+stackcacherelease(void)
+{
+       StackCacheNode *n;
+       uint32 i, pos;
+
+       pos = (m->stackcachepos - m->stackcachecnt) % StackCacheSize;
+       n = (StackCacheNode*)m->stackcache[pos];
+       pos = (pos + 1) % StackCacheSize;
+       for(i = 0; i < StackCacheBatch-1; i++) {
+               n->batch[i] = m->stackcache[pos];
+               pos = (pos + 1) % StackCacheSize;
+       }
+       m->stackcachecnt -= StackCacheBatch;
+       runtime·lock(&stackcachemu);
+       n->next = stackcache;
+       stackcache = n;
+       runtime·unlock(&stackcachemu);
+}
+
 void*
 runtime·stackalloc(uint32 n)
 {
+       uint32 pos;
+       void *v;
+
        // Stackalloc must be called on scheduler stack, so that we
        // never try to grow the stack during the code that stackalloc runs.
        // Doing so would cause a deadlock (issue 1547).
@@ -769,7 +834,15 @@ runtime·stackalloc(uint32 n)
                        runtime·printf("stackalloc: in malloc, size=%d want %d", FixedStack, n);
                        runtime·throw("stackalloc");
                }
-               return runtime·FixAlloc_Alloc(m->stackalloc);
+               if(m->stackcachecnt == 0)
+                       stackcacherefill();
+               pos = m->stackcachepos;
+               pos = (pos - 1) % StackCacheSize;
+               v = m->stackcache[pos];
+               m->stackcachepos = pos;
+               m->stackcachecnt--;
+               m->stackinuse++;
+               return v;
        }
        return runtime·mallocgc(n, FlagNoProfiling|FlagNoGC, 0, 0);
 }
@@ -777,8 +850,16 @@ runtime·stackalloc(uint32 n)
 void
 runtime·stackfree(void *v, uintptr n)
 {
+       uint32 pos;
+
        if(m->mallocing || m->gcing || n == FixedStack) {
-               runtime·FixAlloc_Free(m->stackalloc, v);
+               if(m->stackcachecnt == StackCacheSize)
+                       stackcacherelease();
+               pos = m->stackcachepos;
+               m->stackcache[pos] = v;
+               m->stackcachepos = (pos + 1) % StackCacheSize;
+               m->stackcachecnt++;
+               m->stackinuse--;
                return;
        }
        runtime·free(v);
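
Two details in the malloc.goc code above are easy to miss. First, a refill
batch is a single SysAlloc of FixedStack*StackCacheBatch bytes: the
StackCacheNode header is written into the first segment of that block, the
remaining StackCacheBatch-1 segment addresses go into n->batch, and the node's
own memory is pushed into the per-M ring last, so all sixteen segments
eventually serve as stacks. stackcacherelease does the reverse, reinterpreting
the oldest cached segment as a fresh node header and filing the next fifteen
segments into its batch array before linking it onto the global list. Second,
the ring indices are unsigned and StackCacheSize is a power of two that divides
2^32, so expressions like (pos - 1) % StackCacheSize and
(m->stackcachepos - m->stackcachecnt) % StackCacheSize remain correct even when
the subtraction wraps below zero. The snippet below is ordinary Go, written
here only to demonstrate that wraparound property; it is not part of the change.

    package main

    import "fmt"

    func main() {
        const stackCacheSize = 32 // a power of two, so it divides 1<<32 exactly
        var pos uint32 = 0
        pos = (pos - 1) % stackCacheSize // wraps to 0xFFFFFFFF before the modulo
        fmt.Println(pos)                 // prints 31, the last slot of the ring
    }
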
diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index 86e879afe43ed3f7f8f689c47ede094fa26e8ccf..c7c12b49e8ca1f293dcd4cb3b93f816fe2c0b171 100644
@@ -1176,18 +1176,15 @@ cachestats(GCStats *stats)
        MCache *c;
        int32 i;
        uint64 stacks_inuse;
-       uint64 stacks_sys;
        uint64 *src, *dst;
 
        if(stats)
                runtime·memclr((byte*)stats, sizeof(*stats));
        stacks_inuse = 0;
-       stacks_sys = 0;
        for(mp=runtime·allm; mp; mp=mp->alllink) {
                c = mp->mcache;
                runtime·purgecachedstats(c);
-               stacks_inuse += mp->stackalloc->inuse;
-               stacks_sys += mp->stackalloc->sys;
+               stacks_inuse += mp->stackinuse*FixedStack;
                if(stats) {
                        src = (uint64*)&mp->gcstats;
                        dst = (uint64*)stats;
@@ -1203,7 +1200,6 @@ cachestats(GCStats *stats)
                }
        }
        mstats.stacks_inuse = stacks_inuse;
-       mstats.stacks_sys = stacks_sys;
 }
 
 // Structure of arguments passed to function gc().
diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c
index 80e97795abb3a911dce7e26fea25737cbe3c2366..eba0d6456bc1a39454715a19c330c48bbaaeb960 100644
@@ -345,8 +345,6 @@ mcommoninit(M *mp)
 {
        mp->id = runtime·sched.mcount++;
        mp->fastrand = 0x49f6428aUL + mp->id + runtime·cputicks();
-       mp->stackalloc = runtime·malloc(sizeof(*mp->stackalloc));
-       runtime·FixAlloc_Init(mp->stackalloc, FixedStack, runtime·SysAlloc, nil, nil);
 
        if(mp->mcache == nil)
                mp->mcache = runtime·allocmcache();
diff --git a/src/pkg/runtime/proc_test.go b/src/pkg/runtime/proc_test.go
index 1d51c5271e37dd27589903a94da293e374f40711..0bbf9fa175741de3685eeb4542c187d2fbebd2bd 100644
@@ -53,7 +53,7 @@ func stackGrowthRecursive(i int) {
        }
 }
 
-func BenchmarkStackGrowth(b *testing.B) {
+func benchmarkStackGrowth(b *testing.B, rec int) {
        const CallsPerSched = 1000
        procs := runtime.GOMAXPROCS(-1)
        N := int32(b.N / CallsPerSched)
@@ -63,7 +63,7 @@ func BenchmarkStackGrowth(b *testing.B) {
                        for atomic.AddInt32(&N, -1) >= 0 {
                                runtime.Gosched()
                                for g := 0; g < CallsPerSched; g++ {
-                                       stackGrowthRecursive(10)
+                                       stackGrowthRecursive(rec)
                                }
                        }
                        c <- true
@@ -74,6 +74,14 @@ func BenchmarkStackGrowth(b *testing.B) {
        }
 }
 
+func BenchmarkStackGrowth(b *testing.B) {
+       benchmarkStackGrowth(b, 10)
+}
+
+func BenchmarkStackGrowthDeep(b *testing.B) {
+       benchmarkStackGrowth(b, 1024)
+}
+
 func BenchmarkSyscall(b *testing.B) {
        const CallsPerSched = 1000
        procs := runtime.GOMAXPROCS(-1)
diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h
index a228b06e322e826b0177ff245044fe76105d9466..47a7b6e78b9c9948a1b3aa94bd78be56bbade8fb 100644
@@ -128,6 +128,13 @@ enum
 {
        PtrSize = sizeof(void*),
 };
+enum
+{
+       // Per-M stack segment cache size.
+       StackCacheSize = 32,
+       // Global <-> per-M stack segment cache transfer batch size.
+       StackCacheBatch = 16,
+};
 
 /*
  * structures
@@ -262,7 +269,10 @@ struct     M
        M*      schedlink;
        uint32  machport;       // Return address for Mach IPC (OS X)
        MCache  *mcache;
-       FixAlloc        *stackalloc;
+       int32   stackinuse;
+       uint32  stackcachepos;
+       uint32  stackcachecnt;
+       void*   stackcache[StackCacheSize];
        G*      lockedg;
        G*      idleg;
        uintptr createstack[32];        // Stack that created this thread.
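
Taken together, the new constants and M fields bound the per-thread footprint:
an M caches at most StackCacheSize = 32 segments, refills StackCacheBatch = 16
at a time when it runs dry, and spills 16 back to the global list when it
fills up. With FixedStack on the order of 4 KB at the time (the exact value
varies by OS), that caps an idle thread's cache at roughly 32 * 4 KB = 128 KB,
whereas the per-M FixAlloc this replaces held on to every segment the thread
had ever freed, which is the unbounded per-thread growth the commit message
refers to.
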
diff --git a/src/pkg/runtime/stack_test.go b/src/pkg/runtime/stack_test.go
index 8b75e4d121e00302a117d282f669ea8a9d491b46..f04bddc764a57d6b6f0dc66eef9052b318cb3e41 100644
@@ -34,6 +34,7 @@ package runtime_test
 import (
        . "runtime"
        "testing"
+       "time"
        "unsafe"
 )
 
@@ -1526,3 +1527,51 @@ func stack4988() (uintptr, uintptr) { var buf [4988]byte; use(buf[:]); return St
 func stack4992() (uintptr, uintptr) { var buf [4992]byte; use(buf[:]); return Stackguard() }
 func stack4996() (uintptr, uintptr) { var buf [4996]byte; use(buf[:]); return Stackguard() }
 func stack5000() (uintptr, uintptr) { var buf [5000]byte; use(buf[:]); return Stackguard() }
+
+// TestStackMem measures per-thread stack segment cache behavior.
+// The test consumed up to 500MB in the past.
+func TestStackMem(t *testing.T) {
+       const (
+               BatchSize      = 32
+               BatchCount     = 512
+               ArraySize      = 1024
+               RecursionDepth = 128
+       )
+       if testing.Short() {
+               return
+       }
+       defer GOMAXPROCS(GOMAXPROCS(BatchSize))
+       s0 := new(MemStats)
+       ReadMemStats(s0)
+       for b := 0; b < BatchCount; b++ {
+               c := make(chan bool, BatchSize)
+               for i := 0; i < BatchSize; i++ {
+                       go func() {
+                               var f func(k int, a [ArraySize]byte)
+                               f = func(k int, a [ArraySize]byte) {
+                                       if k == 0 {
+                                               time.Sleep(time.Millisecond)
+                                               return
+                                       }
+                                       f(k-1, a)
+                               }
+                               f(RecursionDepth, [ArraySize]byte{})
+                               c <- true
+                       }()
+               }
+               for i := 0; i < BatchSize; i++ {
+                       <-c
+               }
+       }
+       s1 := new(MemStats)
+       ReadMemStats(s1)
+       consumed := s1.StackSys - s0.StackSys
+       t.Logf("Consumed %vMB for stack mem", consumed>>20)
+       estimate := uint64(8 * BatchSize * ArraySize * RecursionDepth) // 8 is to reduce flakiness.
+       if consumed > estimate {
+               t.Fatalf("Stack mem: want %v, got %v", estimate, consumed)
+       }
+       if s1.StackInuse > 1<<20 {
+               t.Fatalf("Stack inuse: want %v, got %v", 1<<20, s1.StackInuse)
+       }
+}
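
For reference, the test's estimate works out to
8 * BatchSize * ArraySize * RecursionDepth = 8 * 32 * 1024 * 128 bytes = 32 MB.
A plausible reading of that bound: each goroutine's recursion keeps about
RecursionDepth * ArraySize = 128 KB of by-value array arguments live on its
stack, and only BatchSize = 32 goroutines exist at once, so roughly 4 MB of
stack should be needed at any instant; the factor of 8 is, per the comment in
the test, slack to reduce flakiness (segment-size rounding, scheduling jitter).
Before this change the same workload grew StackSys to 300-480 MB (the
TestStackMem rows in the commit message above), which the message attributes
to per-thread segment caches growing without bound. The final check that
StackInuse stays below 1 MB (1<<20 bytes) runs after every goroutine has
exited, when essentially no stack segments should remain in use.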