Introduce global stack segment cache and limit per-thread cache size.
This greatly reduces StackSys memory on workloads that create lots of threads.
benchmark                      old ns/op    new ns/op    delta
BenchmarkStackGrowth                 665          656    -1.35%
BenchmarkStackGrowth-2               333          328    -1.50%
BenchmarkStackGrowth-4               224          172   -23.21%
BenchmarkStackGrowth-8               124           91   -26.13%
BenchmarkStackGrowth-16               82           47   -41.94%
BenchmarkStackGrowth-32               73           40   -44.79%
BenchmarkStackGrowthDeep           97231        94391    -2.92%
BenchmarkStackGrowthDeep-2         47230        58562   +23.99%
BenchmarkStackGrowthDeep-4         24993        49356   +97.48%
BenchmarkStackGrowthDeep-8         15105        30072   +99.09%
BenchmarkStackGrowthDeep-16        10005        15623   +56.15%
BenchmarkStackGrowthDeep-32        12517        13069    +4.41%

benchmark                            old          new    delta
TestStackMem#1,MB                    310           12   -96.13%
TestStackMem#2,MB                    296           14   -95.27%
TestStackMem#3,MB                    479           14   -97.08%
TestStackMem#1,sec                  3.22         2.26   -29.81%
TestStackMem#2,sec                  2.43         2.15   -11.52%
TestStackMem#3,sec                  2.50         2.38    -4.80%
R=sougou, no.smile.face, rsc
CC=golang-dev, msolomon
https://golang.org/cl/7029044
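
For reference, the TestStackMem MB figures above come from runtime.MemStats.StackSys, sampled before and after running batches of goroutines with deep stacks (see TestStackMem below). A minimal standalone sketch of that measurement, not part of this CL; growStack and the batch sizes are illustrative assumptions:

package main

import (
	"fmt"
	"runtime"
	"sync"
)

// growStack forces several stack segment allocations by recursing
// with a large argument.
func growStack(k int, a [1024]byte) {
	if k > 0 {
		growStack(k-1, a)
	}
}

func main() {
	var before, after runtime.MemStats
	runtime.ReadMemStats(&before)

	var wg sync.WaitGroup
	for i := 0; i < 32; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			growStack(128, [1024]byte{})
		}()
	}
	wg.Wait()

	runtime.ReadMemStats(&after)
	fmt.Printf("StackSys delta: %d bytes\n", after.StackSys-before.StackSys)
	fmt.Printf("StackInuse: %d bytes\n", after.StackInuse)
}
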
return ret;
}
+typedef struct StackCacheNode StackCacheNode;
+struct StackCacheNode
+{
+	StackCacheNode *next;
+	void* batch[StackCacheBatch-1];
+};
+
+static StackCacheNode *stackcache;
+static Lock stackcachemu;
+
+// stackcacherefill/stackcacherelease implement a global cache of stack segments.
+// The cache is required to prevent unlimited growth of per-thread caches.
+static void
+stackcacherefill(void)
+{
+	StackCacheNode *n;
+	int32 i, pos;
+
+	runtime·lock(&stackcachemu);
+	n = stackcache;
+	if(n)
+		stackcache = n->next;
+	runtime·unlock(&stackcachemu);
+	if(n == nil) {
+		n = (StackCacheNode*)runtime·SysAlloc(FixedStack*StackCacheBatch);
+		if(n == nil)
+ runtime·throw("out of memory (staccachekrefill)");
+		runtime·xadd64(&mstats.stacks_sys, FixedStack*StackCacheBatch);
+		for(i = 0; i < StackCacheBatch-1; i++)
+			n->batch[i] = (byte*)n + (i+1)*FixedStack;
+	}
+	pos = m->stackcachepos;
+	for(i = 0; i < StackCacheBatch-1; i++) {
+		m->stackcache[pos] = n->batch[i];
+		pos = (pos + 1) % StackCacheSize;
+	}
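+	// The node's own memory is itself a usable stack segment;
+	// cache it as the last entry of the batch.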
+	m->stackcache[pos] = n;
+	pos = (pos + 1) % StackCacheSize;
+	m->stackcachepos = pos;
+	m->stackcachecnt += StackCacheBatch;
+}
+
+static void
+stackcacherelease(void)
+{
+	StackCacheNode *n;
+	uint32 i, pos;
+
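+	// pos indexes the oldest cached segment; its memory doubles as
+	// the list node that carries the batch back to the global cache.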
+	pos = (m->stackcachepos - m->stackcachecnt) % StackCacheSize;
+	n = (StackCacheNode*)m->stackcache[pos];
+	pos = (pos + 1) % StackCacheSize;
+	for(i = 0; i < StackCacheBatch-1; i++) {
+		n->batch[i] = m->stackcache[pos];
+		pos = (pos + 1) % StackCacheSize;
+	}
+	m->stackcachecnt -= StackCacheBatch;
+	runtime·lock(&stackcachemu);
+	n->next = stackcache;
+	stackcache = n;
+	runtime·unlock(&stackcachemu);
+}
+
void*
runtime·stackalloc(uint32 n)
{
+	uint32 pos;
+	void *v;
+
// Stackalloc must be called on scheduler stack, so that we
// never try to grow the stack during the code that stackalloc runs.
// Doing so would cause a deadlock (issue 1547).
runtime·printf("stackalloc: in malloc, size=%d want %d", FixedStack, n);
runtime·throw("stackalloc");
}
- return runtime·FixAlloc_Alloc(m->stackalloc);
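+		// Pop the most recently freed segment from the per-M cache,
+		// refilling a batch from the global cache when it is empty.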
+		if(m->stackcachecnt == 0)
+			stackcacherefill();
+		pos = m->stackcachepos;
+		pos = (pos - 1) % StackCacheSize;
+		v = m->stackcache[pos];
+		m->stackcachepos = pos;
+		m->stackcachecnt--;
+		m->stackinuse++;
+		return v;
}
return runtime·mallocgc(n, FlagNoProfiling|FlagNoGC, 0, 0);
}
void
runtime·stackfree(void *v, uintptr n)
{
+	uint32 pos;
+
if(m->mallocing || m->gcing || n == FixedStack) {
- runtime·FixAlloc_Free(m->stackalloc, v);
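+		// Push the segment onto the per-M cache; when the cache is
+		// full, move a batch back to the global cache.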
+		if(m->stackcachecnt == StackCacheSize)
+			stackcacherelease();
+		pos = m->stackcachepos;
+		m->stackcache[pos] = v;
+		m->stackcachepos = (pos + 1) % StackCacheSize;
+		m->stackcachecnt++;
+		m->stackinuse--;
return;
}
runtime·free(v);
MCache *c;
int32 i;
uint64 stacks_inuse;
- uint64 stacks_sys;
uint64 *src, *dst;
if(stats)
runtime·memclr((byte*)stats, sizeof(*stats));
stacks_inuse = 0;
- stacks_sys = 0;
for(mp=runtime·allm; mp; mp=mp->alllink) {
c = mp->mcache;
runtime·purgecachedstats(c);
- stacks_inuse += mp->stackalloc->inuse;
- stacks_sys += mp->stackalloc->sys;
+ stacks_inuse += mp->stackinuse*FixedStack;
if(stats) {
src = (uint64*)&mp->gcstats;
dst = (uint64*)stats;
}
}
mstats.stacks_inuse = stacks_inuse;
- mstats.stacks_sys = stacks_sys;
}
// Structure of arguments passed to function gc().
{
mp->id = runtime·sched.mcount++;
mp->fastrand = 0x49f6428aUL + mp->id + runtime·cputicks();
- mp->stackalloc = runtime·malloc(sizeof(*mp->stackalloc));
- runtime·FixAlloc_Init(mp->stackalloc, FixedStack, runtime·SysAlloc, nil, nil);
if(mp->mcache == nil)
mp->mcache = runtime·allocmcache();
}
}
-func BenchmarkStackGrowth(b *testing.B) {
+func benchmarkStackGrowth(b *testing.B, rec int) {
const CallsPerSched = 1000
procs := runtime.GOMAXPROCS(-1)
N := int32(b.N / CallsPerSched)
for atomic.AddInt32(&N, -1) >= 0 {
runtime.Gosched()
for g := 0; g < CallsPerSched; g++ {
- stackGrowthRecursive(10)
+ stackGrowthRecursive(rec)
}
}
c <- true
}
}
+func BenchmarkStackGrowth(b *testing.B) {
+	benchmarkStackGrowth(b, 10)
+}
+
+func BenchmarkStackGrowthDeep(b *testing.B) {
+	benchmarkStackGrowth(b, 1024)
+}
+
func BenchmarkSyscall(b *testing.B) {
const CallsPerSched = 1000
procs := runtime.GOMAXPROCS(-1)
{
PtrSize = sizeof(void*),
};
+enum
+{
+	// Per-M stack segment cache size.
+	StackCacheSize = 32,
+	// Global <-> per-M stack segment cache transfer batch size.
+	StackCacheBatch = 16,
+};
/*
* structures
M* schedlink;
uint32 machport; // Return address for Mach IPC (OS X)
MCache *mcache;
- FixAlloc *stackalloc;
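+	// Per-M cache of FixedStack-sized stack segments,
+	// maintained as a ring buffer by stackcachepos/stackcachecnt.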
+	int32 stackinuse;
+	uint32 stackcachepos;
+	uint32 stackcachecnt;
+	void* stackcache[StackCacheSize];
G* lockedg;
G* idleg;
uintptr createstack[32]; // Stack that created this thread.
import (
. "runtime"
"testing"
+ "time"
"unsafe"
)
func stack4992() (uintptr, uintptr) { var buf [4992]byte; use(buf[:]); return Stackguard() }
func stack4996() (uintptr, uintptr) { var buf [4996]byte; use(buf[:]); return Stackguard() }
func stack5000() (uintptr, uintptr) { var buf [5000]byte; use(buf[:]); return Stackguard() }
+
+// TestStackMem exercises the per-thread stack segment cache by running
+// batches of goroutines with deep stacks. Without the global cache the
+// test consumed up to 500MB of stack memory.
+func TestStackMem(t *testing.T) {
+	const (
+		BatchSize      = 32
+		BatchCount     = 512
+		ArraySize      = 1024
+		RecursionDepth = 128
+	)
+	if testing.Short() {
+		return
+	}
+	defer GOMAXPROCS(GOMAXPROCS(BatchSize))
+	s0 := new(MemStats)
+	ReadMemStats(s0)
+	for b := 0; b < BatchCount; b++ {
+		c := make(chan bool, BatchSize)
+		for i := 0; i < BatchSize; i++ {
+			go func() {
+				var f func(k int, a [ArraySize]byte)
+				f = func(k int, a [ArraySize]byte) {
+					if k == 0 {
+						time.Sleep(time.Millisecond)
+						return
+					}
+					f(k-1, a)
+				}
+				f(RecursionDepth, [ArraySize]byte{})
+				c <- true
+			}()
+		}
+		for i := 0; i < BatchSize; i++ {
+			<-c
+		}
+	}
+	s1 := new(MemStats)
+	ReadMemStats(s1)
+	consumed := s1.StackSys - s0.StackSys
+	t.Logf("Consumed %vMB for stack mem", consumed>>20)
+	estimate := uint64(8 * BatchSize * ArraySize * RecursionDepth) // 8 is to reduce flakiness.
+	if consumed > estimate {
+		t.Fatalf("Stack mem: want %v, got %v", estimate, consumed)
+	}
+	if s1.StackInuse > 1<<20 {
+		t.Fatalf("Stack inuse: want %v, got %v", 1<<20, s1.StackInuse)
+	}
+}