runtime: combine small NoScan allocations
author    Dmitriy Vyukov <dvyukov@google.com>  Fri, 24 Jan 2014 18:35:11 +0000 (22:35 +0400)
committer Dmitriy Vyukov <dvyukov@google.com>  Fri, 24 Jan 2014 18:35:11 +0000 (22:35 +0400)
Combine NoScan allocations < 16 bytes into a single memory block.
Reduces the number of allocations on the json/garbage benchmarks by 10+% (a sketch of the combining logic follows the file list below).

                              old          new       delta
json-1
allocated                 8039872      7949194      -1.13%
allocs                     105774        93776     -11.34%
cputime                 156200000    100700000     -35.53%
gc-pause-one              4908873      3814853     -22.29%
gc-pause-total            2748969      2899288      +5.47%
rss                      52674560     43560960     -17.30%
sys-gc                    3796976      3256304     -14.24%
sys-heap                 43843584     35192832     -19.73%
sys-other                 5589312      5310784      -4.98%
sys-stack                  393216       393216      +0.00%
sys-total                53623088     44153136     -17.66%
time                    156193436    100886714     -35.41%
virtual-mem             256548864    256540672      -0.00%

                              old          new       delta
garbage-1
allocated                 2996885      2932982      -2.13%
allocs                      62904        55200     -12.25%
cputime                  17470000     17400000      -0.40%
gc-pause-one            932757485    925806143      -0.75%
gc-pause-total            4663787      4629030      -0.75%
rss                    1151074304   1133670400      -1.51%
sys-gc                   66068352     65085312      -1.49%
sys-heap               1039728640   1024065536      -1.51%
sys-other                38038208     37485248      -1.45%
sys-stack                 8650752      8781824      +1.52%
sys-total              1152485952   1135417920      -1.48%
time                     17478088     17418005      -0.34%
virtual-mem            1343709184   1324204032      -1.45%

LGTM=iant, bradfitz
R=golang-codereviews, dave, iant, rsc, bradfitz
CC=golang-codereviews, khr
https://golang.org/cl/38750047

src/pkg/runtime/env_posix.c
src/pkg/runtime/malloc.goc
src/pkg/runtime/malloc.h
src/pkg/runtime/mgc0.c
src/pkg/runtime/mheap.c
src/pkg/runtime/runtime.h
src/pkg/sync/pool_test.go
test/deferfin.go
test/fixedbugs/issue4618.go
test/fixedbugs/issue4667.go
test/tinyfin.go [new file with mode: 0644]
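
As context for the diffs that follow, here is a minimal Go sketch of the bump-pointer combining logic described above. It is an illustration only, with invented names (tinyAllocator, align): the real implementation below is written in C inside runtime·mallocgc, keeps the cache per P, applies only to FlagNoScan objects, and also decides when to replace the current block, which this sketch omits.

package main

import "fmt"

const tinySize = 16 // mirrors TinySize in malloc.h

// tinyAllocator models one P's tiny-allocation cache: a current block
// and a bump offset within it.
type tinyAllocator struct {
	block []byte
	off   int
}

// align rounds n up to a multiple of a (a power of two).
func align(n, a int) int { return (n + a - 1) &^ (a - 1) }

// alloc hands out size bytes (size < tinySize) from the current block,
// starting a fresh zeroed block when the request does not fit.
func (t *tinyAllocator) alloc(size int) []byte {
	// Conservative alignment, as in mallocgc: 8, 4, or 2 bytes
	// depending on the low bits of size.
	a := 1
	switch {
	case size&7 == 0:
		a = 8
	case size&3 == 0:
		a = 4
	case size&1 == 0:
		a = 2
	}
	off := align(t.off, a)
	if t.block == nil || off+size > tinySize {
		t.block = make([]byte, tinySize) // new TinySize block
		off = 0
	}
	p := t.block[off : off+size : off+size]
	t.off = off + size
	return p
}

func main() {
	var t tinyAllocator
	a, b, c := t.alloc(5), t.alloc(5), t.alloc(5)
	// Three 5-byte objects share one 16-byte block: addresses are consecutive.
	fmt.Printf("%p %p %p\n", &a[0], &b[0], &c[0])
}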

diff --git a/src/pkg/runtime/env_posix.c b/src/pkg/runtime/env_posix.c
index 5847f8c8a8b27dc2b6a0d1747681ce0cff37632a..746c7ee3fde6ff0ba729f2c55f59c579a0b6fde4 100644 (file)
@@ -5,6 +5,8 @@
 // +build darwin dragonfly freebsd linux netbsd openbsd solaris windows
 
 #include "runtime.h"
+#include "arch_GOARCH.h"
+#include "malloc.h"
 
 Slice syscall·envs;
 
@@ -44,15 +46,24 @@ void
 syscall·setenv_c(String k, String v)
 {
        byte *arg[2];
+       uintptr len;
 
        if(_cgo_setenv == nil)
                return;
 
-       arg[0] = runtime·malloc(k.len + 1);
+       // Objects that are explicitly freed must be at least 16 bytes in size,
+       // so that they are not allocated using tiny alloc.
+       len = k.len + 1;
+       if(len < TinySize)
+               len = TinySize;
+       arg[0] = runtime·malloc(len);
        runtime·memmove(arg[0], k.str, k.len);
        arg[0][k.len] = 0;
 
-       arg[1] = runtime·malloc(v.len + 1);
+       len = v.len + 1;
+       if(len < TinySize)
+               len = TinySize;
+       arg[1] = runtime·malloc(len);
        runtime·memmove(arg[1], v.str, v.len);
        arg[1][v.len] = 0;
 
diff --git a/src/pkg/runtime/malloc.goc b/src/pkg/runtime/malloc.goc
index 739c61e4f49b93e9cbf57d26e9e774cbdb2d085d..0a0420d415bfdeba4724fe11b9ec03e859eb7443 100644 (file)
@@ -26,6 +26,8 @@ extern MStats mstats; // defined in zruntime_def_$GOOS_$GOARCH.go
 
 extern volatile intgo runtime·MemProfileRate;
 
+static void* largealloc(uint32, uintptr*);
+
 // Allocate an object of at least size bytes.
 // Small objects are allocated from the per-thread cache's free lists.
 // Large objects (> 32 kB) are allocated straight from the heap.
@@ -34,12 +36,13 @@ void*
 runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
 {
        int32 sizeclass;
+       uintptr tinysize, size1;
        intgo rate;
        MCache *c;
        MCacheList *l;
-       uintptr npages;
-       MSpan *s;
        MLink *v;
+       byte *tiny;
+       P *p;
 
        if(size == 0) {
                // All 0-length allocations use this pointer.
@@ -59,6 +62,79 @@ runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
 
        c = m->mcache;
        if(!runtime·debug.efence && size <= MaxSmallSize) {
+               if((flag&(FlagNoScan|FlagNoGC)) == FlagNoScan && size < TinySize) {
+                       // Tiny allocator.
+                       //
+                       // Tiny allocator combines several tiny allocation requests
+                       // into a single memory block. The resulting memory block
+                       // is freed when all subobjects are unreachable. The subobjects
+                       // must be FlagNoScan (have no pointers); this ensures that
+                       // the amount of potentially wasted memory is bounded.
+                       //
+                       // The size of the memory block used for combining (TinySize) is tunable.
+                       // The current setting is 16 bytes, which implies 2x worst-case memory
+                       // wastage (when all but one subobject are unreachable).
+                       // 8 bytes would result in no wastage at all, but provides fewer
+                       // opportunities for combining.
+                       // 32 bytes provides more opportunities for combining,
+                       // but can lead to 4x worst-case wastage.
+                       // The best-case saving is 8x regardless of block size.
+                       //
+                       // Objects obtained from the tiny allocator must not be freed explicitly.
+                       // So when an object is going to be freed explicitly, we ensure that
+                       // its size >= TinySize.
+                       //
+                       // SetFinalizer has a special case for objects potentially coming
+                       // from the tiny allocator; in such a case it allows setting finalizers
+                       // for an inner byte of a memory block.
+                       //
+                       // The main targets of the tiny allocator are small strings and
+                       // standalone escaping variables. On the json benchmark the
+                       // allocator reduces the number of allocations by ~12% and
+                       // reduces heap size by ~20%.
+
+                       p = m->p;
+                       tinysize = p->tinysize;
+                       if(size <= tinysize) {
+                               tiny = p->tiny;
+                               // Align tiny pointer for required (conservative) alignment.
+                               if((size&7) == 0)
+                                       tiny = (byte*)ROUND((uintptr)tiny, 8);
+                               else if((size&3) == 0)
+                                       tiny = (byte*)ROUND((uintptr)tiny, 4);
+                               else if((size&1) == 0)
+                                       tiny = (byte*)ROUND((uintptr)tiny, 2);
+                               size1 = size + (tiny - p->tiny);
+                               if(size1 <= tinysize) {
+                                       // The object fits into existing tiny block.
+                                       v = (MLink*)tiny;
+                                       p->tiny += size1;
+                                       p->tinysize -= size1;
+                                       m->mallocing = 0;
+                                       m->locks--;
+                                       if(m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
+                                               g->stackguard0 = StackPreempt;
+                                       return v;
+                               }
+                       }
+                       // Allocate a new TinySize block.
+                       l = &c->list[TinySizeClass];
+                       if(l->list == nil)
+                               runtime·MCache_Refill(c, TinySizeClass);
+                       v = l->list;
+                       l->list = v->next;
+                       l->nlist--;
+                       ((uint64*)v)[0] = 0;
+                       ((uint64*)v)[1] = 0;
+                       // See if we need to replace the existing tiny block with the new one
+                       // based on the amount of remaining free space.
+                       if(TinySize-size > tinysize) {
+                               p->tiny = (byte*)v + size;
+                               p->tinysize = TinySize - size;
+                       }
+                       size = TinySize;
+                       goto done;
+               }
                // Allocate from mcache free lists.
                // Inlined version of SizeToClass().
                if(size <= 1024-8)
@@ -78,23 +154,11 @@ runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
                        if(size > 2*sizeof(uintptr) && ((uintptr*)v)[1] != 0)
                                runtime·memclr((byte*)v, size);
                }
+       done:
                c->local_cachealloc += size;
        } else {
-               // TODO(rsc): Report tracebacks for very large allocations.
-
                // Allocate directly from heap.
-               npages = size >> PageShift;
-               if((size & PageMask) != 0)
-                       npages++;
-               s = runtime·MHeap_Alloc(&runtime·mheap, npages, 0, 1, !(flag & FlagNoZero));
-               if(s == nil)
-                       runtime·throw("out of memory");
-               s->limit = (byte*)(s->start<<PageShift) + size;
-               size = npages<<PageShift;
-               v = (void*)(s->start << PageShift);
-
-               // setup for mark sweep
-               runtime·markspan(v, 0, 0, true);
+               v = largealloc(flag, &size);
        }
 
        if(flag & FlagNoGC)
@@ -151,6 +215,29 @@ runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
        return v;
 }
 
+static void*
+largealloc(uint32 flag, uintptr *sizep)
+{
+       uintptr npages, size;
+       MSpan *s;
+       void *v;
+
+       // Allocate directly from heap.
+       size = *sizep;
+       npages = size >> PageShift;
+       if((size & PageMask) != 0)
+               npages++;
+       s = runtime·MHeap_Alloc(&runtime·mheap, npages, 0, 1, !(flag & FlagNoZero));
+       if(s == nil)
+               runtime·throw("out of memory");
+       s->limit = (byte*)(s->start<<PageShift) + size;
+       *sizep = npages<<PageShift;
+       v = (void*)(s->start << PageShift);
+       // setup for mark sweep
+       runtime·markspan(v, 0, 0, true);
+       return v;
+}
+
 void*
 runtime·malloc(uintptr size)
 {
@@ -182,6 +269,10 @@ runtime·free(void *v)
        }
        size = s->elemsize;
        sizeclass = s->sizeclass;
+       // Objects that are smaller than TinySize can be allocated using tiny alloc;
+       // if such an object were then combined with an object with a finalizer, we would crash.
+       if(size < TinySize)
+               runtime·throw("freeing too small block");
 
        if(raceenabled)
                runtime·racefree(v);
@@ -347,6 +438,9 @@ runtime·mallocinit(void)
 
        runtime·InitSizes();
 
+       if(runtime·class_to_size[TinySizeClass] != TinySize)
+               runtime·throw("bad TinySizeClass");
+
        // limit = runtime·memlimit();
        // See https://code.google.com/p/go/issues/detail?id=5049
        // TODO(rsc): Fix after 1.1.
@@ -450,7 +544,7 @@ runtime·mallocinit(void)
        m->mcache = runtime·allocmcache();
 
        // See if it works.
-       runtime·free(runtime·malloc(1));
+       runtime·free(runtime·malloc(TinySize));
 }
 
 void*
@@ -760,12 +854,17 @@ func SetFinalizer(obj Eface, finalizer Eface) {
                goto throw;
        }
        ot = (PtrType*)obj.type;
-       if(ot->elem != nil && ot->elem->size == 0) {
+       // As an implementation detail we do not run finalizers for zero-sized objects,
+       // because we use &runtime·zerobase for all such allocations.
+       if(ot->elem != nil && ot->elem->size == 0)
                return;
-       }
        if(!runtime·mlookup(obj.data, &base, &size, nil) || obj.data != base) {
-               runtime·printf("runtime.SetFinalizer: pointer not at beginning of allocated block\n");
-               goto throw;
+               // As an implementation detail we allow setting finalizers for an inner byte
+               // of an object if it could have come from tiny alloc (see mallocgc for details).
+               if(ot->elem == nil || (ot->elem->kind&KindNoPointers) == 0 || ot->elem->size >= TinySize) {
+                       runtime·printf("runtime.SetFinalizer: pointer not at beginning of allocated block\n");
+                       goto throw;
+               }
        }
        if(finalizer.type != nil) {
                if(finalizer.type->kind != KindFunc)
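
As a hedged usage sketch of what the relaxed check permits: a pointer-free object smaller than TinySize may land at an inner byte of a combined block, and SetFinalizer now accepts it anyway. The invented example below assumes nothing beyond the public runtime API; test/tinyfin.go at the end of this diff is the real regression test.

package main

import (
	"runtime"
	"time"
)

func main() {
	x := new(int32) // pointer-free and smaller than TinySize: may be tiny-allocated
	*x = 42
	// Without the relaxed check, a tiny-combined object would trigger the
	// "pointer not at beginning of allocated block" throw; the special case
	// accepts inner bytes for pointer-free types smaller than TinySize.
	runtime.SetFinalizer(x, func(p *int32) { println("finalized", *p) })
	x = nil
	runtime.GC()
	time.Sleep(10 * time.Millisecond) // give the finalizer goroutine a chance to run
}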
diff --git a/src/pkg/runtime/malloc.h b/src/pkg/runtime/malloc.h
index 9f34b55461d69b86fd17681a37bce5d8aad46081..8122b4b0b89ff9b7659953acfcbbd63705d2c522 100644 (file)
@@ -108,6 +108,10 @@ enum
        // Tunable constants.
        MaxSmallSize = 32<<10,
 
+       // Tiny allocator parameters, see "Tiny allocator" comment in malloc.goc.
+       TinySize = 16,
+       TinySizeClass = 2,
+
        FixAllocChunk = 16<<10,         // Chunk size for FixAlloc
        MaxMHeapList = 1<<(20 - PageShift),     // Maximum page length for fixed-size list in MHeap.
        HeapAllocChunk = 1<<20,         // Chunk size for heap growth
diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index 8b6eeab1050659e60ebf074ec0a0dd71694f0832..609dbfece120f3b488e0e99b000701c6698e2919 100644 (file)
@@ -84,8 +84,11 @@ clearpools(void)
        }
        pools.head = nil;
 
-       // clear defer pools
        for(pp=runtime·allp; p=*pp; pp++) {
+               // clear tinyalloc pool
+               p->tiny = nil;
+               p->tinysize = 0;
+               // clear defer pools
                for(i=0; i<nelem(p->deferpool); i++)
                        p->deferpool[i] = nil;
        }
@@ -1202,6 +1205,7 @@ markroot(ParFor *desc, uint32 i)
        MSpan **allspans, *s;
        uint32 spanidx;
        G *gp;
+       void *p;
 
        USED(&desc);
        wbuf = getempty(nil);
@@ -1241,7 +1245,9 @@ markroot(ParFor *desc, uint32 i)
                                // don't mark finalized object, but scan it so we
                                // retain everything it points to.
                                spf = (SpecialFinalizer*)sp;
-                               enqueue1(&wbuf, (Obj){(void*)((s->start << PageShift) + spf->offset), s->elemsize, 0});
+                               // A finalizer can be set for an inner byte of an object; find the object's beginning.
+                               p = (void*)((s->start << PageShift) + spf->offset/s->elemsize*s->elemsize);
+                               enqueue1(&wbuf, (Obj){p, s->elemsize, 0});
                                enqueue1(&wbuf, (Obj){(void*)&spf->fn, PtrSize, 0});
                                enqueue1(&wbuf, (Obj){(void*)&spf->fint, PtrSize, 0});
                                enqueue1(&wbuf, (Obj){(void*)&spf->ot, PtrSize, 0});
@@ -1663,12 +1669,16 @@ sweepspan(ParFor *desc, uint32 idx)
        specialp = &s->specials;
        special = *specialp;
        while(special != nil) {
-               p = (byte*)(s->start << PageShift) + special->offset;
+               // A finalizer can be set for an inner byte of an object; find the object's beginning.
+               p = (byte*)(s->start << PageShift) + special->offset/size*size;
                off = (uintptr*)p - (uintptr*)arena_start;
                bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
                shift = off % wordsPerBitmapWord;
                bits = *bitp>>shift;
                if((bits & (bitAllocated|bitMarked)) == bitAllocated) {
+                       // Find the exact byte for which the special was set up
+                       // (as opposed to the object's beginning).
+                       p = (byte*)(s->start << PageShift) + special->offset;
                        // about to free object: splice out special record
                        y = special;
                        special = special->next;
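
Both GC paths above recover the start of the object containing an interior finalizer offset with the same truncating division. A small worked example in Go (objectBase is an invented helper):

package main

import "fmt"

// objectBase rounds an interior byte offset within a span down to the
// start of the fixed-size object containing it, mirroring
// special->offset/size*size in the hunks above.
func objectBase(offset, elemSize uintptr) uintptr {
	return offset / elemSize * elemSize
}

func main() {
	// In a span of 16-byte objects, a finalizer registered at offset 36
	// (an inner byte of the third object) resolves to object base 32.
	fmt.Println(objectBase(36, 16)) // 32
}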
diff --git a/src/pkg/runtime/mheap.c b/src/pkg/runtime/mheap.c
index 920d6536829f650cd660044fb24c2e58ef15f24a..ddbcc5f72f71a4993126a359c9358032988e265b 100644 (file)
@@ -605,6 +605,8 @@ removespecial(void *p, byte kind)
        runtime·lock(&span->specialLock);
        t = &span->specials;
        while((s = *t) != nil) {
+               // This function is used for finalizers only, so we don't check for
+               // "interior" specials (p must be exactly equal to s->offset).
                if(offset == s->offset && kind == s->kind) {
                        *t = s->next;
                        runtime·unlock(&span->specialLock);
@@ -713,9 +715,9 @@ runtime·freeallspecials(MSpan *span, void *p, uintptr size)
        runtime·lock(&span->specialLock);
        t = &span->specials;
        while((s = *t) != nil) {
-               if(offset < s->offset)
+               if(offset + size <= s->offset)
                        break;
-               if(offset == s->offset) {
+               if(offset <= s->offset) {
                        *t = s->next;
                        s->next = list;
                        list = s;
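
The old comparison matched a special only at the exact start offset of the freed block; the new comparisons treat the block as the half-open interval [offset, offset+size), so specials registered on inner bytes of a tiny block are freed with it. A hedged Go restatement (specialInBlock is an invented name):

package main

import "fmt"

// specialInBlock reports whether a special registered at byte offset off
// belongs to the freed block [base, base+size), matching the relaxed
// comparisons in freeallspecials above.
func specialInBlock(off, base, size uintptr) bool {
	return base <= off && off < base+size
}

func main() {
	// A 16-byte tiny block at offset 32 owns specials at offsets 32..47.
	fmt.Println(specialInBlock(36, 32, 16)) // true
	fmt.Println(specialInBlock(48, 32, 16)) // false
}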
diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h
index 13fb55454754e943b94a4b361f93557e573235a7..499983fd78e27d088a93c5c8bdedd5367a7158eb 100644 (file)
@@ -385,6 +385,11 @@ struct P
        MCache* mcache;
        Defer*  deferpool[5];   // pool of available Defer structs of different sizes (see panic.c)
 
+       // Allocator cache for tiny objects w/o pointers.
+       // See "Tiny allocator" comment in malloc.goc.
+       byte*   tiny;
+       uintptr tinysize;
+
        // Cache of goroutine ids, amortizes accesses to runtime·sched.goidgen.
        uint64  goidcache;
        uint64  goidcacheend;
diff --git a/src/pkg/sync/pool_test.go b/src/pkg/sync/pool_test.go
index 3bf5131ea05d8835108283be90274901d2d5b178..accf524a9ff57e0a9dd3c9e671eff28a53e23c9f 100644 (file)
@@ -73,8 +73,8 @@ func TestPoolGC(t *testing.T) {
        var fin uint32
        const N = 100
        for i := 0; i < N; i++ {
-               v := new(int)
-               runtime.SetFinalizer(v, func(vv *int) {
+               v := new(string)
+               runtime.SetFinalizer(v, func(vv *string) {
                        atomic.AddUint32(&fin, 1)
                })
                p.Put(v)
diff --git a/test/deferfin.go b/test/deferfin.go
index fa5a93354dce57c851dac4d49450c72ac56bef07..80372916d20b303303483f6ea36911b20482c8dc 100644 (file)
@@ -34,17 +34,17 @@ func main() {
        for i := 0; i < N; i++ {
                go func() {
                        defer wg.Done()
-                       v := new(int)
+                       v := new(string)
                        f := func() {
-                               if *v != 0 {
+                               if *v != "" {
                                        panic("oops")
                                }
                        }
-                       if *v != 0 {
+                       if *v != "" {
                                // let the compiler think f escapes
                                sink = f
                        }
-                       runtime.SetFinalizer(v, func(p *int) {
+                       runtime.SetFinalizer(v, func(p *string) {
                                atomic.AddInt32(&count, -1)
                        })
                        defer f()
diff --git a/test/fixedbugs/issue4618.go b/test/fixedbugs/issue4618.go
index ff91ae70673afae7cb82f94d6afb7456bda8475d..fe875b35013f9a7bf7f8d09a011eb1476a8bf309 100644 (file)
@@ -30,7 +30,7 @@ func G() {
 func main() {
        nf := testing.AllocsPerRun(100, F)
        ng := testing.AllocsPerRun(100, G)
-       if int(nf) != 1 {
+       if int(nf) > 1 {
                fmt.Printf("AllocsPerRun(100, F) = %v, want 1\n", nf)
                os.Exit(1)
        }
diff --git a/test/fixedbugs/issue4667.go b/test/fixedbugs/issue4667.go
index 3a00a31952c327b9d93d7674801d14116c33f0cf..18d773c2cfb9a21201ec3530bdeed8141719834d 100644 (file)
@@ -26,11 +26,11 @@ func F() {
 func main() {
        nf := testing.AllocsPerRun(100, F)
        ng := testing.AllocsPerRun(100, G)
-       if int(nf) != 1 {
+       if int(nf) > 1 {
                fmt.Printf("AllocsPerRun(100, F) = %v, want 1\n", nf)
                os.Exit(1)
        }
-       if int(ng) != 1 {
+       if int(ng) > 1 {
                fmt.Printf("AllocsPerRun(100, G) = %v, want 1\n", ng)
                os.Exit(1)
        }
diff --git a/test/tinyfin.go b/test/tinyfin.go
new file mode 100644 (file)
index 0000000..8fb109f
--- /dev/null
+++ b/test/tinyfin.go
@@ -0,0 +1,62 @@
+// run
+
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test finalizers work for tiny (combined) allocations.
+
+package main
+
+import (
+       "runtime"
+       "sync/atomic"
+       "time"
+)
+
+func main() {
+       // Does not work on 32-bit systems due to the partially conservative GC.
+       // Try to enable this when we have a fully precise GC.
+       if runtime.GOARCH != "amd64" {
+               return
+       }
+       // Likewise for gccgo.
+       if runtime.Compiler == "gccgo" {
+               return
+       }
+       N := int32(100)
+       count := N
+       done := make([]bool, N)
+       for i := int32(0); i < N; i++ {
+               x := i // subject to tiny alloc
+               // the closure must be big enough to be combined
+               runtime.SetFinalizer(&x, func(p *int32) {
+                       // Check that p points to the correct subobject of the tiny allocation.
+                       // It's a bit tricky, because we can't capture another variable
+                       // with the expected value (it would be combined as well).
+                       if *p < 0 || *p >= N {
+                               println("got", *p)
+                               panic("corrupted")
+                       }
+                       if done[*p] {
+                               println("got", *p)
+                               panic("already finalized")
+                       }
+                       done[*p] = true
+                       atomic.AddInt32(&count, -1)
+               })
+       }
+       for i := 0; i < 4; i++ {
+               runtime.GC()
+               time.Sleep(10 * time.Millisecond)
+       }
+       // Some of the finalizers may not be executed
+       // if the outermost allocations are combined with something persistent.
+       // Currently 4 int32's are combined into a 16-byte block;
+       // ensure that most of them are finalized.
+       if count >= N/4 {
+               println(count, "out of", N, "finalizers are not called")
+               panic("not all finalizers are called")
+       }
+}
+