runtime: concurrent GC sweep
author    Dmitriy Vyukov <dvyukov@google.com>
          Wed, 12 Feb 2014 18:16:42 +0000 (22:16 +0400)
committer Dmitriy Vyukov <dvyukov@google.com>
          Wed, 12 Feb 2014 18:16:42 +0000 (22:16 +0400)
Moves the sweep phase out of stop-the-world by adding a
background sweeper goroutine and lazy on-demand sweeping.

It turned out to be somewhat trickier than I expected,
because there is no point in time at which we know the size of the live heap
or a consistent count of mallocs and frees.
So everything related to next_gc, mprof, memstats, etc. becomes trickier.

At the end of GC, next_gc is conservatively set to heap_alloc*GOGC,
which is much larger than the real value. But after every sweep
next_gc is decremented by freed*GOGC. So when everything is swept,
next_gc becomes what it should be.
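
A minimal Go sketch of this accounting (illustrative names, not the runtime code):
next_gc starts out as if everything allocated were live, and every sweep that frees
memory subtracts the freed bytes scaled by (GOGC+100)/100, matching the xadd64 calls
in the mgc0.c change below, so next_gc converges on the real target once sweeping
completes.

package main

import "fmt"

const gogc = 100 // GOGC, in percent

func main() {
	heapAlloc := uint64(4 << 20) // live data + not-yet-swept garbage at the end of GC

	// At the end of GC the live heap size is unknown, so next_gc is set
	// conservatively, as if everything currently allocated were live.
	nextGC := heapAlloc + heapAlloc*gogc/100

	// Every sweep that frees memory lowers next_gc by freed*(GOGC+100)/100,
	// so once all spans are swept next_gc equals live*(GOGC+100)/100.
	for _, freed := range []uint64{1 << 20, 512 << 10, 512 << 10} {
		nextGC -= freed * (gogc + 100) / 100
		fmt.Printf("swept: freed %d bytes, next_gc is now %d\n", freed, nextGC)
	}
}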

For mprof I had to introduce a 3-generation scheme (allocs, recent_allocs, prev_allocs),
because by the end of GC we only know the number of frees for the *previous* GC.
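
A small Go model of this 3-generation rotation (field and method names are illustrative;
the real bookkeeping is in the mprof.goc change below): mallocs and explicit frees land
in the recent stats, frees discovered by the concurrent sweep land in the prev stats, and
each MProf_GC retires prev into the published totals while rotating recent into prev, so
the published allocs and frees always describe the same completed cycle.

package main

import "fmt"

type bucket struct {
	allocs, frees             uintptr // published, mutually consistent totals
	prevAllocs, prevFrees     uintptr // between the last two GCs
	recentAllocs, recentFrees uintptr // since the last GC
}

func (b *bucket) malloc()       { b.recentAllocs++ }
func (b *bucket) explicitFree() { b.recentFrees++ }
func (b *bucket) sweepFree()    { b.prevFrees++ } // free found by concurrent sweep

// gc mirrors MProf_GC: retire prev into the totals, rotate recent into prev.
func (b *bucket) gc() {
	b.allocs += b.prevAllocs
	b.frees += b.prevFrees
	b.prevAllocs, b.prevFrees = b.recentAllocs, b.recentFrees
	b.recentAllocs, b.recentFrees = 0, 0
}

func main() {
	var b bucket
	b.malloc()
	b.malloc()
	b.gc()        // the two allocs move recent -> prev; nothing published yet
	b.sweepFree() // sweep after that GC finds one of them dead
	b.gc()        // now 2 allocs and 1 free are published together
	fmt.Println(b.allocs, b.frees) // 2 1
}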

Significant caution is required not to cross the yet-unknown real value of next_gc.
This is achieved by two means:
1. Whenever I allocate a span from MCentral, I sweep a span in that MCentral.
2. Whenever I allocate N pages from MHeap, I sweep until at least N pages are
returned to the heap.
This provides quite strong guarantees that the heap does not grow when it should not
(rule 2 is sketched below).
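
A rough Go sketch of rule 2 (sweepone stands in for runtime·sweepone; everything else
is hypothetical): keep sweeping spans until at least npage pages have been returned to
the heap, and only then allocate.

package main

import "fmt"

// reclaimBeforeAlloc sweeps until at least npage pages have been reclaimed.
// sweepone sweeps one span and reports the pages it freed; ok is false once
// every span has been swept.
func reclaimBeforeAlloc(npage uintptr, sweepone func() (pages uintptr, ok bool)) uintptr {
	var reclaimed uintptr
	for reclaimed < npage {
		n, ok := sweepone()
		if !ok {
			break // nothing left to sweep
		}
		reclaimed += n
	}
	return reclaimed // the caller now allocates npage pages from the heap
}

func main() {
	freed := []uintptr{0, 1, 0, 2, 4} // pages freed by successive sweeps
	i := 0
	sweepone := func() (uintptr, bool) {
		if i == len(freed) {
			return 0, false
		}
		n := freed[i]
		i++
		return n, true
	}
	fmt.Println(reclaimBeforeAlloc(3, sweepone)) // 3
}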

http-1
allocated                    7036         7033      -0.04%
allocs                         60           60      +0.00%
cputime                     51050        46700      -8.52%
gc-pause-one             34060569      1777993     -94.78%
gc-pause-total               2554          133     -94.79%
latency-50                 178448       170926      -4.22%
latency-95                 284350       198294     -30.26%
latency-99                 345191       220652     -36.08%
rss                     101564416    101007360      -0.55%
sys-gc                    6606832      6541296      -0.99%
sys-heap                 88801280     87752704      -1.18%
sys-other                 7334208      7405928      +0.98%
sys-stack                  524288       524288      +0.00%
sys-total               103266608    102224216      -1.01%
time                        50339        46533      -7.56%
virtual-mem             292990976    293728256      +0.25%

garbage-1
allocated                 2983818      2990889      +0.24%
allocs                      62880        62902      +0.03%
cputime                  16480000     16190000      -1.76%
gc-pause-one            828462467    487875135     -41.11%
gc-pause-total            4142312      2439375     -41.11%
rss                    1151709184   1153712128      +0.17%
sys-gc                   66068352     66068352      +0.00%
sys-heap               1039728640   1039728640      +0.00%
sys-other                37776064     40770176      +7.93%
sys-stack                 8781824      8781824      +0.00%
sys-total              1152354880   1155348992      +0.26%
time                     16496998     16199876      -1.80%
virtual-mem            1409564672   1402281984      -0.52%

LGTM=rsc
R=golang-codereviews, sameer, rsc, iant, jeremyjackins, gobot
CC=golang-codereviews, khr
https://golang.org/cl/46430043

src/pkg/runtime/malloc.goc
src/pkg/runtime/malloc.h
src/pkg/runtime/mcentral.c
src/pkg/runtime/mgc0.c
src/pkg/runtime/mheap.c
src/pkg/runtime/mprof.goc

diff --git a/src/pkg/runtime/malloc.goc b/src/pkg/runtime/malloc.goc
index 76945a28d4d794746405c6417cbd926f934b72c3..babfb9e176106a682f93b034a695010e03912d57 100644
@@ -284,6 +284,10 @@ runtime·free(void *v)
        if(raceenabled)
                runtime·racefree(v);
 
+       // Ensure that the span is swept.
+       // If we free into an unswept span, we will corrupt GC bitmaps.
+       runtime·MSpan_EnsureSwept(s);
+
        if(s->specials != nil)
                runtime·freeallspecials(s, v, size);
 
diff --git a/src/pkg/runtime/malloc.h b/src/pkg/runtime/malloc.h
index fc6c85e2c13314b562e9ac7903586f3611c0d202..ac9e6a2883c60efd7240578d34d43d3ece84c6e4 100644
@@ -403,6 +403,12 @@ struct MSpan
        PageID  start;          // starting page number
        uintptr npages;         // number of pages in span
        MLink   *freelist;      // list of free objects
+       // sweep generation:
+       // if sweepgen == h->sweepgen - 2, the span needs sweeping
+       // if sweepgen == h->sweepgen - 1, the span is currently being swept
+       // if sweepgen == h->sweepgen, the span is swept and ready to use
+       // h->sweepgen is incremented by 2 after every GC
+       uint32  sweepgen;
        uint16  ref;            // number of allocated objects in this span
        uint8   sizeclass;      // size class
        uint8   state;          // MSpanInUse etc
@@ -416,6 +422,8 @@ struct MSpan
 };
 
 void   runtime·MSpan_Init(MSpan *span, PageID start, uintptr npages);
+void   runtime·MSpan_EnsureSwept(MSpan *span);
+bool   runtime·MSpan_Sweep(MSpan *span);
 
 // Every MSpan is in one doubly-linked list,
 // either one of the MHeap's free lists or one of the
@@ -423,6 +431,7 @@ void        runtime·MSpan_Init(MSpan *span, PageID start, uintptr npages);
 void   runtime·MSpanList_Init(MSpan *list);
 bool   runtime·MSpanList_IsEmpty(MSpan *list);
 void   runtime·MSpanList_Insert(MSpan *list, MSpan *span);
+void   runtime·MSpanList_InsertBack(MSpan *list, MSpan *span);
 void   runtime·MSpanList_Remove(MSpan *span); // from whatever list it is in
 
 
@@ -439,7 +448,7 @@ struct MCentral
 void   runtime·MCentral_Init(MCentral *c, int32 sizeclass);
 int32  runtime·MCentral_AllocList(MCentral *c, MLink **first);
 void   runtime·MCentral_FreeList(MCentral *c, MLink *first);
-void   runtime·MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end);
+bool   runtime·MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end);
 
 // Main malloc heap.
 // The heap itself is the "free[]" and "large" arrays,
@@ -448,10 +457,15 @@ struct MHeap
 {
        Lock;
        MSpan free[MaxMHeapList];       // free lists of given length
-       MSpan large;                    // free lists length >= MaxMHeapList
-       MSpan **allspans;
+       MSpan freelarge;                // free lists length >= MaxMHeapList
+       MSpan busy[MaxMHeapList];       // busy lists of large objects of given length
+       MSpan busylarge;                // busy lists of large objects length >= MaxMHeapList
+       MSpan **allspans;               // all spans out there
+       MSpan **sweepspans;             // copy of allspans referenced by sweeper
        uint32  nspan;
        uint32  nspancap;
+       uint32  sweepgen;               // sweep generation, see comment in MSpan
+       uint32  sweepdone;              // all spans are swept
 
        // span lookup
        MSpan** spans;
@@ -487,7 +501,7 @@ struct MHeap
 extern MHeap runtime·mheap;
 
 void   runtime·MHeap_Init(MHeap *h);
-MSpan* runtime·MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct, int32 zeroed);
+MSpan* runtime·MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, bool large, bool zeroed);
 void   runtime·MHeap_Free(MHeap *h, MSpan *s, int32 acct);
 MSpan* runtime·MHeap_Lookup(MHeap *h, void *v);
 MSpan* runtime·MHeap_LookupMaybe(MHeap *h, void *v);
@@ -501,6 +515,7 @@ void*       runtime·mallocgc(uintptr size, uintptr typ, uint32 flag);
 void*  runtime·persistentalloc(uintptr size, uintptr align, uint64 *stat);
 int32  runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **s);
 void   runtime·gc(int32 force);
+uintptr        runtime·sweepone(void);
 void   runtime·markscan(void *v);
 void   runtime·marknogc(void *v);
 void   runtime·checkallocated(void *v, uintptr n);
@@ -528,7 +543,7 @@ enum
 };
 
 void   runtime·MProf_Malloc(void*, uintptr, uintptr);
-void   runtime·MProf_Free(Bucket*, void*, uintptr);
+void   runtime·MProf_Free(Bucket*, void*, uintptr, bool);
 void   runtime·MProf_GC(void);
 void   runtime·MProf_TraceGC(void);
 int32  runtime·gcprocs(void);
@@ -542,7 +557,7 @@ void        runtime·removefinalizer(void*);
 void   runtime·queuefinalizer(byte *p, FuncVal *fn, uintptr nret, Type *fint, PtrType *ot);
 
 void   runtime·freeallspecials(MSpan *span, void *p, uintptr size);
-bool   runtime·freespecial(Special *s, void *p, uintptr size);
+bool   runtime·freespecial(Special *s, void *p, uintptr size, bool freed);
 
 enum
 {
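
The sweepgen handshake above can be pictured with a standalone Go sketch (sync/atomic
stands in for the runtime's atomics; the types are illustrative, not the runtime's):
an unswept span is claimed by CAS'ing sweepgen from sg-2 to sg-1, and sweeping finishes
by storing sg, which is also the state MSpan_EnsureSwept waits for.

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
)

type mspan struct{ sweepgen uint32 }

type mheap struct{ sweepgen uint32 } // incremented by 2 after every GC

// ensureSwept mirrors MSpan_EnsureSwept: either claim the span (sg-2 -> sg-1)
// and sweep it ourselves, or wait for whoever is already sweeping it.
func ensureSwept(h *mheap, s *mspan) {
	sg := atomic.LoadUint32(&h.sweepgen)
	if atomic.LoadUint32(&s.sweepgen) == sg {
		return // already swept for this cycle
	}
	if atomic.CompareAndSwapUint32(&s.sweepgen, sg-2, sg-1) {
		sweep(h, s)
		return
	}
	for atomic.LoadUint32(&s.sweepgen) != sg {
		runtime.Gosched() // a concurrent sweeper owns it; wait until it stores sg
	}
}

func sweep(h *mheap, s *mspan) {
	// ... free unmarked objects, adjust next_gc, etc. ...
	atomic.StoreUint32(&s.sweepgen, atomic.LoadUint32(&h.sweepgen))
}

func main() {
	h := &mheap{sweepgen: 4}
	s := &mspan{sweepgen: h.sweepgen - 2} // needs sweeping
	ensureSwept(h, s)
	fmt.Println(s.sweepgen == h.sweepgen) // true
}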
diff --git a/src/pkg/runtime/mcentral.c b/src/pkg/runtime/mcentral.c
index 735a7e6a9a38c38c3c5a872ff5b3be96153fabac..d96a73394da2a7123900913fbc70a6c8497c2dc6 100644
@@ -39,17 +39,58 @@ runtime·MCentral_AllocList(MCentral *c, MLink **pfirst)
 {
        MSpan *s;
        int32 cap, n;
+       uint32 sg;
 
        runtime·lock(c);
-       // Replenish central list if empty.
-       if(runtime·MSpanList_IsEmpty(&c->nonempty)) {
-               if(!MCentral_Grow(c)) {
+       sg = runtime·mheap.sweepgen;
+retry:
+       for(s = c->nonempty.next; s != &c->nonempty; s = s->next) {
+               if(s->sweepgen == sg-2 && runtime·cas(&s->sweepgen, sg-2, sg-1)) {
                        runtime·unlock(c);
-                       *pfirst = nil;
-                       return 0;
+                       runtime·MSpan_Sweep(s);
+                       runtime·lock(c);
+                       // the span could have been moved to heap, retry
+                       goto retry;
+               }
+               if(s->sweepgen == sg-1) {
+                       // the span is being swept by background sweeper, skip
+                       continue;
+               }
+               // we have a nonempty span that does not require sweeping, allocate from it
+               goto havespan;
+       }
+
+       for(s = c->empty.next; s != &c->empty; s = s->next) {
+               if(s->sweepgen == sg-2 && runtime·cas(&s->sweepgen, sg-2, sg-1)) {
+                       // we have an empty span that requires sweeping,
+                       // sweep it and see if we can free some space in it
+                       runtime·MSpanList_Remove(s);
+                       // swept spans are at the end of the list
+                       runtime·MSpanList_InsertBack(&c->empty, s);
+                       runtime·unlock(c);
+                       runtime·MSpan_Sweep(s);
+                       runtime·lock(c);
+                       // the span could be moved to nonempty or heap, retry
+                       goto retry;
+               }
+               if(s->sweepgen == sg-1) {
+                       // the span is being swept by background sweeper, skip
+                       continue;
                }
+               // already swept empty span,
+               // all subsequent ones must also be either swept or in process of sweeping
+               break;
+       }
+
+       // Replenish central list if empty.
+       if(!MCentral_Grow(c)) {
+               runtime·unlock(c);
+               *pfirst = nil;
+               return 0;
        }
        s = c->nonempty.next;
+
+havespan:
        cap = (s->npages << PageShift) / s->elemsize;
        n = cap - s->ref;
        *pfirst = s->freelist;
@@ -57,7 +98,7 @@ runtime·MCentral_AllocList(MCentral *c, MLink **pfirst)
        s->ref += n;
        c->nfree -= n;
        runtime·MSpanList_Remove(s);
-       runtime·MSpanList_Insert(&c->empty, s);
+       runtime·MSpanList_InsertBack(&c->empty, s);
        runtime·unlock(c);
        return n;
 }
@@ -116,8 +157,9 @@ MCentral_Free(MCentral *c, void *v)
 }
 
 // Free n objects from a span s back into the central free list c.
-// Called from GC.
-void
+// Called during sweep.
+// Returns true if the span was returned to heap.
+bool
 runtime·MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end)
 {
        int32 size;
@@ -136,19 +178,21 @@ runtime·MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *
        s->ref -= n;
        c->nfree += n;
 
-       // If s is completely freed, return it to the heap.
-       if(s->ref == 0) {
-               size = runtime·class_to_size[c->sizeclass];
-               runtime·MSpanList_Remove(s);
-               *(uintptr*)(s->start<<PageShift) = 1;  // needs zeroing
-               s->freelist = nil;
-               c->nfree -= (s->npages << PageShift) / size;
-               runtime·unlock(c);
-               runtime·unmarkspan((byte*)(s->start<<PageShift), s->npages<<PageShift);
-               runtime·MHeap_Free(&runtime·mheap, s, 0);
-       } else {
+       if(s->ref != 0) {
                runtime·unlock(c);
+               return false;
        }
+
+       // s is completely freed, return it to the heap.
+       size = runtime·class_to_size[c->sizeclass];
+       runtime·MSpanList_Remove(s);
+       *(uintptr*)(s->start<<PageShift) = 1;  // needs zeroing
+       s->freelist = nil;
+       c->nfree -= (s->npages << PageShift) / size;
+       runtime·unlock(c);
+       runtime·unmarkspan((byte*)(s->start<<PageShift), s->npages<<PageShift);
+       runtime·MHeap_Free(&runtime·mheap, s, 0);
+       return true;
 }
 
 void
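
A rough Go model of the new MCentral_AllocList loop (types and helpers are illustrative,
not the runtime's): scan the list, claim any unswept span with a CAS on sweepgen, sweep
it outside the lock, then rescan because the span may have moved in the meantime.

package main

import "sync/atomic"

type mspan struct {
	sweepgen uint32
	next     *mspan
}

type mcentral struct {
	sweepgen uint32 // copy of the heap's sweep generation
	nonempty *mspan // singly linked here for brevity
}

func (c *mcentral) allocSpan() *mspan {
	sg := atomic.LoadUint32(&c.sweepgen)
retry:
	for s := c.nonempty; s != nil; s = s.next {
		if atomic.LoadUint32(&s.sweepgen) == sg-2 &&
			atomic.CompareAndSwapUint32(&s.sweepgen, sg-2, sg-1) {
			sweepSpan(s, sg)
			goto retry // the span could have moved (or gone back to the heap); rescan
		}
		if atomic.LoadUint32(&s.sweepgen) == sg-1 {
			continue // being swept by the background sweeper; skip it
		}
		return s // swept and nonempty: allocate from it
	}
	return nil // the caller would grow the central list here (MCentral_Grow)
}

func sweepSpan(s *mspan, sg uint32) {
	// ... free unmarked objects ...
	atomic.StoreUint32(&s.sweepgen, sg)
}

func main() {
	c := &mcentral{sweepgen: 4, nonempty: &mspan{sweepgen: 2}}
	_ = c.allocSpan()
}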
diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index dc2a8a99bbd652624591132a845bc1f7d6a6bd8f..02872759b14b7165ca50344f5b3ae6d9ba16d694 100644
@@ -2,7 +2,53 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// Garbage collector.
+// Garbage collector (GC).
+//
+// GC is:
+// - mark&sweep
+// - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc)
+// - parallel (up to MaxGcproc threads)
+// - partially concurrent (mark is stop-the-world, while sweep is concurrent)
+// - non-moving/non-compacting
+// - full (non-partial)
+//
+// GC rate.
+// Next GC is after we've allocated an extra amount of memory proportional to
+// the amount already in use. The proportion is controlled by GOGC environment variable
+// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
+// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
+// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
+// (and also the amount of extra memory used).
+//
+// Concurrent sweep.
+// The sweep phase proceeds concurrently with normal program execution.
+// The heap is swept span-by-span both lazily (when a goroutine needs another span)
+// and concurrently in a background goroutine (this helps programs that are not CPU bound).
+// However, at the end of the stop-the-world GC phase we don't know the size of the live heap,
+// and so next_gc calculation is tricky and happens as follows.
+// At the end of the stop-the-world phase next_gc is conservatively set based on total
+// heap size; all spans are marked as "needs sweeping".
+// Whenever a span is swept, next_gc is decremented by GOGC*newly_freed_memory.
+// The background sweeper goroutine simply sweeps spans one-by-one bringing next_gc
+// closer to the target value. However, this is not enough to avoid over-allocating memory.
+// Consider that a goroutine wants to allocate a new span for a large object and
+// there are no free swept spans, but there are small-object unswept spans.
+// If the goroutine naively allocates a new span, it can surpass the yet-unknown
+// target next_gc value. In order to prevent such cases (1) when a goroutine needs
+// to allocate a new small-object span, it sweeps small-object spans for the same
+// object size until it frees at least one object; (2) when a goroutine needs to
+// allocate large-object span from heap, it sweeps spans until it frees at least
+// that many pages into heap. Together these two measures ensure that we don't surpass
+// target next_gc value by a large margin. There is an exception: if a goroutine sweeps
+// and frees two nonadjacent one-page spans to the heap, it will allocate a new two-page span,
+// but there can still be other one-page unswept spans which could be combined into a two-page span.
+// It's critical to ensure that no operations proceed on unswept spans (that would corrupt
+// mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
+// so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
+// When a goroutine explicitly frees an object or sets a finalizer, it ensures that
+// the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
+// The finalizer goroutine is kicked off only when all spans are swept.
+// When the next GC starts, it sweeps all not-yet-swept spans (if any).
 
 #include "runtime.h"
 #include "arch_GOARCH.h"
@@ -52,6 +98,11 @@ enum {
        RootCount       = 5,
 };
 
+#define GcpercentUnknown (-2)
+
+// Initialized from $GOGC.  GOGC=off means no gc.
+static int32 gcpercent = GcpercentUnknown;
+
 static struct
 {
        Lock;  
@@ -197,14 +248,15 @@ extern byte ebss[];
 extern byte gcdata[];
 extern byte gcbss[];
 
-static G *fing;
-static FinBlock *finq; // list of finalizers that are to be executed
-static FinBlock *finc; // cache of free blocks
-static FinBlock *allfin; // list of all blocks
-static Lock finlock;
-static int32 fingwait;
+static G       *fing;
+static FinBlock        *finq; // list of finalizers that are to be executed
+static FinBlock        *finc; // cache of free blocks
+static FinBlock        *allfin; // list of all blocks
+static int32   fingwait;
+static Lock    gclock;
 
-static void runfinq(void);
+static void    runfinq(void);
+static void    bgsweep(void);
 static Workbuf* getempty(Workbuf*);
 static Workbuf* getfull(Workbuf*);
 static void    putempty(Workbuf*);
@@ -215,6 +267,9 @@ static void flushallmcaches(void);
 static void    scanframe(Stkframe *frame, void *wbufp);
 static void    addstackroots(G *gp, Workbuf **wbufp);
 
+static FuncVal runfinqv = {runfinq};
+static FuncVal bgsweepv = {bgsweep};
+
 static struct {
        uint64  full;  // lock-free list of full blocks
        uint64  empty; // lock-free list of empty blocks
@@ -225,7 +280,6 @@ static struct {
        volatile uint32 ndone;
        Note    alldone;
        ParFor  *markfor;
-       ParFor  *sweepfor;
 
        Lock;
        byte    *chunk;
@@ -266,6 +320,8 @@ static struct {
                uint64 foundword;
                uint64 foundspan;
        } markonly;
+       uint32 nbgsweep;
+       uint32 npausesweep;
 } gcstats;
 
 // markonly marks an object. It returns true if the object
@@ -1209,8 +1265,9 @@ markroot(ParFor *desc, uint32 i)
 {
        Workbuf *wbuf;
        FinBlock *fb;
+       MHeap *h;
        MSpan **allspans, *s;
-       uint32 spanidx;
+       uint32 spanidx, sg;
        G *gp;
        void *p;
 
@@ -1232,12 +1289,16 @@ markroot(ParFor *desc, uint32 i)
 
        case RootSpanTypes:
                // mark span types and MSpan.specials (to walk spans only once)
-               allspans = runtime·mheap.allspans;
+               h = &runtime·mheap;
+               sg = h->sweepgen;
+               allspans = h->allspans;
                for(spanidx=0; spanidx<runtime·mheap.nspan; spanidx++) {
                        Special *sp;
                        SpecialFinalizer *spf;
 
                        s = allspans[spanidx];
+                       if(s->sweepgen != sg)
+                               runtime·throw("gc: unswept span");
                        if(s->state != MSpanInUse)
                                continue;
                        // The garbage collector ignores type pointers stored in MSpan.types:
@@ -1601,7 +1662,7 @@ runtime·queuefinalizer(byte *p, FuncVal *fn, uintptr nret, Type *fint, PtrType
        FinBlock *block;
        Finalizer *f;
 
-       runtime·lock(&finlock);
+       runtime·lock(&gclock);
        if(finq == nil || finq->cnt == finq->cap) {
                if(finc == nil) {
                        finc = runtime·persistentalloc(FinBlockSize, 0, &mstats.gc_sys);
@@ -1621,13 +1682,31 @@ runtime·queuefinalizer(byte *p, FuncVal *fn, uintptr nret, Type *fint, PtrType
        f->fint = fint;
        f->ot = ot;
        f->arg = p;
-       runtime·unlock(&finlock);
+       runtime·unlock(&gclock);
+}
+
+void
+runtime·MSpan_EnsureSwept(MSpan *s)
+{
+       uint32 sg;
+
+       sg = runtime·mheap.sweepgen;
+       if(runtime·atomicload(&s->sweepgen) == sg)
+               return;
+       if(runtime·cas(&s->sweepgen, sg-2, sg-1)) {
+               runtime·MSpan_Sweep(s);
+               return;
+       }
+       // unfortunate condition, and we don't have efficient means to wait
+       while(runtime·atomicload(&s->sweepgen) != sg)
+               runtime·osyield();  
 }
 
 // Sweep frees or collects finalizers for blocks not marked in the mark phase.
 // It clears the mark bits in preparation for the next GC round.
-static void
-sweepspan(ParFor *desc, uint32 idx)
+// Returns true if the span was returned to heap.
+bool
+runtime·MSpan_Sweep(MSpan *s)
 {
        int32 cl, n, npages;
        uintptr size, off, *bitp, shift, bits;
@@ -1639,14 +1718,15 @@ sweepspan(ParFor *desc, uint32 idx)
        byte *type_data;
        byte compression;
        uintptr type_data_inc;
-       MSpan *s;
        MLink *x;
        Special *special, **specialp, *y;
+       bool res, sweepgenset;
 
-       USED(&desc);
-       s = runtime·mheap.allspans[idx];
-       if(s->state != MSpanInUse)
-               return;
+       if(s->state != MSpanInUse || s->sweepgen != runtime·mheap.sweepgen-1) {
+               runtime·printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n",
+                       s->state, s->sweepgen, runtime·mheap.sweepgen);
+               runtime·throw("MSpan_Sweep: bad span state");
+       }
        arena_start = runtime·mheap.arena_start;
        cl = s->sizeclass;
        size = s->elemsize;
@@ -1657,9 +1737,11 @@ sweepspan(ParFor *desc, uint32 idx)
                npages = runtime·class_to_allocnpages[cl];
                n = (npages << PageShift) / size;
        }
+       res = false;
        nfree = 0;
        end = &head;
        c = m->mcache;
+       sweepgenset = false;
 
        // mark any free objects in this span so we don't collect them
        for(x = s->freelist; x != nil; x = x->next) {
@@ -1690,7 +1772,7 @@ sweepspan(ParFor *desc, uint32 idx)
                        y = special;
                        special = special->next;
                        *specialp = special;
-                       if(!runtime·freespecial(y, p, size)) {
+                       if(!runtime·freespecial(y, p, size, false)) {
                                // stop freeing of object if it has a finalizer
                                *bitp |= bitMarked << shift;
                        }
@@ -1736,12 +1818,17 @@ sweepspan(ParFor *desc, uint32 idx)
                        // Free large span.
                        runtime·unmarkspan(p, 1<<PageShift);
                        *(uintptr*)p = (uintptr)0xdeaddeaddeaddeadll;   // needs zeroing
+                       // important to set sweepgen before returning it to heap
+                       runtime·atomicstore(&s->sweepgen, runtime·mheap.sweepgen);
+                       sweepgenset = true;
                        if(runtime·debug.efence)
                                runtime·SysFree(p, size, &mstats.gc_sys);
                        else
                                runtime·MHeap_Free(&runtime·mheap, s, 1);
                        c->local_nlargefree++;
                        c->local_largefree += size;
+                       runtime·xadd64(&mstats.next_gc, -(uint64)(size * (gcpercent + 100)/100));
+                       res = true;
                } else {
                        // Free small object.
                        switch(compression) {
@@ -1763,10 +1850,86 @@ sweepspan(ParFor *desc, uint32 idx)
                }
        }
 
+       if(!sweepgenset)
+               runtime·atomicstore(&s->sweepgen, runtime·mheap.sweepgen);
        if(nfree) {
                c->local_nsmallfree[cl] += nfree;
                c->local_cachealloc -= nfree * size;
-               runtime·MCentral_FreeSpan(&runtime·mheap.central[cl], s, nfree, head.next, end);
+               runtime·xadd64(&mstats.next_gc, -(uint64)(nfree * size * (gcpercent + 100)/100));
+               res = runtime·MCentral_FreeSpan(&runtime·mheap.central[cl], s, nfree, head.next, end);
+       }
+       return res;
+}
+
+// State of background sweep.
+// Protected by gclock.
+static struct
+{
+       G*      g;
+       bool    parked;
+
+       MSpan** spans;
+       uint32  nspan;
+       uint32  spanidx;
+} sweep;
+
+// background sweeping goroutine
+static void
+bgsweep(void)
+{
+       g->issystem = 1;
+       for(;;) {
+               while(runtime·sweepone() != -1) {
+                       gcstats.nbgsweep++;
+                       runtime·gosched();
+               }
+               runtime·lock(&gclock);
+               if(finq != nil) {
+                       // kick off or wake up goroutine to run queued finalizers
+                       if(fing == nil)
+                               fing = runtime·newproc1(&runfinqv, nil, 0, 0, runtime·gc);
+                       else if(fingwait) {
+                               fingwait = 0;
+                               runtime·ready(fing);
+                       }
+               }
+               sweep.parked = true;
+               runtime·parkunlock(&gclock, "GC sweep wait");
+       }
+}
+
+// sweeps one span
+// returns number of pages returned to heap, or -1 if there is nothing to sweep
+uintptr
+runtime·sweepone(void)
+{
+       MSpan *s;
+       uint32 idx, sg;
+       uintptr npages;
+
+       // increment locks to ensure that the goroutine is not preempted
+       // in the middle of sweep thus leaving the span in an inconsistent state for next GC
+       m->locks++;
+       sg = runtime·mheap.sweepgen;
+       for(;;) {
+               idx = runtime·xadd(&sweep.spanidx, 1) - 1;
+               if(idx >= sweep.nspan) {
+                       runtime·mheap.sweepdone = true;
+                       m->locks--;
+                       return -1;
+               }
+               s = sweep.spans[idx];
+               if(s->state != MSpanInUse) {
+                       s->sweepgen = sg;
+                       continue;
+               }
+               if(s->sweepgen != sg-2 || !runtime·cas(&s->sweepgen, sg-2, sg-1))
+                       continue;
+               npages = s->npages;
+               if(!runtime·MSpan_Sweep(s))
+                       npages = 0;
+               m->locks--;
+               return npages;
        }
 }
 
@@ -1859,26 +2022,12 @@ runtime·gchelper(void)
        // help other threads scan secondary blocks
        scanblock(nil, true);
 
-       runtime·parfordo(work.sweepfor);
        bufferList[m->helpgc].busy = 0;
        nproc = work.nproc;  // work.nproc can change right after we increment work.ndone
        if(runtime·xadd(&work.ndone, +1) == nproc-1)
                runtime·notewakeup(&work.alldone);
 }
 
-#define GcpercentUnknown (-2)
-
-// Initialized from $GOGC.  GOGC=off means no gc.
-//
-// Next gc is after we've allocated an extra amount of
-// memory proportional to the amount already in use.
-// If gcpercent=100 and we're using 4M, we'll gc again
-// when we get to 8M.  This keeps the gc cost in linear
-// proportion to the allocation cost.  Adjusting gcpercent
-// just changes the linear constant (and also the amount of
-// extra memory used).
-static int32 gcpercent = GcpercentUnknown;
-
 static void
 cachestats(void)
 {
@@ -2088,21 +2237,6 @@ runtime·gc(int32 force)
        runtime·semrelease(&runtime·worldsema);
        runtime·starttheworld();
        m->locks--;
-
-       // now that gc is done, kick off finalizer thread if needed
-       if(finq != nil) {
-               runtime·lock(&finlock);
-               // kick off or wake up goroutine to run queued finalizers
-               if(fing == nil)
-                       fing = runtime·newproc1(&runfinqv, nil, 0, 0, runtime·gc);
-               else if(fingwait) {
-                       fingwait = 0;
-                       runtime·ready(fing);
-               }
-               runtime·unlock(&finlock);
-       }
-       // give the queued finalizers, if any, a chance to run
-       runtime·gosched();
 }
 
 static void
@@ -2118,7 +2252,7 @@ static void
 gc(struct gc_args *args)
 {
        int64 t0, t1, t2, t3, t4;
-       uint64 heap0, heap1, obj0, obj1, ninstr;
+       uint64 heap0, heap1, obj, ninstr;
        GCStats stats;
        M *mp;
        uint32 i;
@@ -2133,19 +2267,9 @@ gc(struct gc_args *args)
        for(mp=runtime·allm; mp; mp=mp->alllink)
                runtime·settype_flush(mp);
 
-       heap0 = 0;
-       obj0 = 0;
-       if(runtime·debug.gctrace) {
-               updatememstats(nil);
-               heap0 = mstats.heap_alloc;
-               obj0 = mstats.nmalloc - mstats.nfree;
-       }
-
        m->locks++;     // disable gc during mallocs in parforalloc
        if(work.markfor == nil)
                work.markfor = runtime·parforalloc(MaxGcproc);
-       if(work.sweepfor == nil)
-               work.sweepfor = runtime·parforalloc(MaxGcproc);
        m->locks--;
 
        if(itabtype == nil) {
@@ -2154,32 +2278,39 @@ gc(struct gc_args *args)
                itabtype = ((PtrType*)eface.type)->elem;
        }
 
+       t1 = runtime·nanotime();
+
+       // Sweep what is not swept by bgsweep.
+       while(runtime·sweepone() != -1)
+               gcstats.npausesweep++;
+
        work.nwait = 0;
        work.ndone = 0;
        work.nproc = runtime·gcprocs();
        runtime·parforsetup(work.markfor, work.nproc, RootCount + runtime·allglen, nil, false, markroot);
-       runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap.nspan, nil, true, sweepspan);
        if(work.nproc > 1) {
                runtime·noteclear(&work.alldone);
                runtime·helpgc(work.nproc);
        }
 
-       t1 = runtime·nanotime();
+       t2 = runtime·nanotime();
 
        gchelperstart();
        runtime·parfordo(work.markfor);
        scanblock(nil, true);
 
-       t2 = runtime·nanotime();
-
-       runtime·parfordo(work.sweepfor);
-       bufferList[m->helpgc].busy = 0;
        t3 = runtime·nanotime();
 
+       bufferList[m->helpgc].busy = 0;
        if(work.nproc > 1)
                runtime·notesleep(&work.alldone);
 
        cachestats();
+       // next_gc calculation is tricky with concurrent sweep since we don't know size of live heap
+       // estimate what was live heap size after previous GC (for tracing only)
+       heap0 = mstats.next_gc*100/(gcpercent+100);
+       // conservatively set next_gc to high value assuming that everything is live
+       // concurrent/lazy sweep will reduce this number while discovering new garbage
        mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100;
 
        t4 = runtime·nanotime();
@@ -2193,20 +2324,23 @@ gc(struct gc_args *args)
        if(runtime·debug.gctrace) {
                updatememstats(&stats);
                heap1 = mstats.heap_alloc;
-               obj1 = mstats.nmalloc - mstats.nfree;
+               obj = mstats.nmalloc - mstats.nfree;
 
-               stats.nprocyield += work.sweepfor->nprocyield;
-               stats.nosyield += work.sweepfor->nosyield;
-               stats.nsleep += work.sweepfor->nsleep;
+               stats.nprocyield += work.markfor->nprocyield;
+               stats.nosyield += work.markfor->nosyield;
+               stats.nsleep += work.markfor->nsleep;
 
-               runtime·printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB %D -> %D (%D-%D) objects,"
+               runtime·printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB, %D (%D-%D) objects,"
+                               " %d/%d/%d sweeps,"
                                " %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n",
-                       mstats.numgc, work.nproc, (t2-t1)/1000000, (t3-t2)/1000000, (t1-t0+t4-t3)/1000000,
-                       heap0>>20, heap1>>20, obj0, obj1,
+                       mstats.numgc, work.nproc, (t3-t2)/1000000, (t2-t1)/1000000, (t1-t0+t4-t3)/1000000,
+                       heap0>>20, heap1>>20, obj,
                        mstats.nmalloc, mstats.nfree,
+                       sweep.nspan, gcstats.nbgsweep, gcstats.npausesweep,
                        stats.nhandoff, stats.nhandoffcnt,
-                       work.sweepfor->nsteal, work.sweepfor->nstealcnt,
+                       work.markfor->nsteal, work.markfor->nstealcnt,
                        stats.nprocyield, stats.nosyield, stats.nsleep);
+               gcstats.nbgsweep = gcstats.npausesweep = 0;
                if(CollectStats) {
                        runtime·printf("scan: %D bytes, %D objects, %D untyped, %D types from MSpan\n",
                                gcstats.nbytes, gcstats.obj.cnt, gcstats.obj.notype, gcstats.obj.typelookup);
@@ -2233,6 +2367,31 @@ gc(struct gc_args *args)
                }
        }
 
+       // We cache current runtime·mheap.allspans array in sweep.spans,
+       // because the former can be resized and freed.
+       // Otherwise we would need to take heap lock every time
+       // we want to convert span index to span pointer.
+
+       // Free the old cached array if necessary.
+       if(sweep.spans && sweep.spans != runtime·mheap.allspans)
+               runtime·SysFree(sweep.spans, sweep.nspan*sizeof(sweep.spans[0]), &mstats.other_sys);
+       // Cache the current array.
+       runtime·mheap.sweepspans = runtime·mheap.allspans;
+       runtime·mheap.sweepgen += 2;
+       runtime·mheap.sweepdone = false;
+       sweep.spans = runtime·mheap.allspans;
+       sweep.nspan = runtime·mheap.nspan;
+       sweep.spanidx = 0;
+
+       runtime·lock(&gclock);
+       if(sweep.g == nil)
+               sweep.g = runtime·newproc1(&bgsweepv, nil, 0, 0, runtime·gc);
+       else if(sweep.parked) {
+               sweep.parked = false;
+               runtime·ready(sweep.g);
+       }
+       runtime·unlock(&gclock);
+
        runtime·MProf_GC();
 }
 
@@ -2327,15 +2486,15 @@ runfinq(void)
        frame = nil;
        framecap = 0;
        for(;;) {
-               runtime·lock(&finlock);
+               runtime·lock(&gclock);
                fb = finq;
                finq = nil;
                if(fb == nil) {
                        fingwait = 1;
-                       runtime·parkunlock(&finlock, "finalizer wait");
+                       runtime·parkunlock(&gclock, "finalizer wait");
                        continue;
                }
-               runtime·unlock(&finlock);
+               runtime·unlock(&gclock);
                if(raceenabled)
                        runtime·racefingo();
                for(; fb; fb=next) {
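
The background sweeper added here has roughly the following shape (a standalone Go
sketch; the channel stands in for the runtime's parkunlock/ready machinery): claim
spans one at a time through an atomic index, yield between spans, and park once
everything is swept.

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
)

type span struct{ pages uintptr }

var (
	sweepSpans []span
	sweepIdx   uint32                   // next span index to claim
	wake       = make(chan struct{}, 1) // stand-in for ready(sweep.g)
)

// sweepone claims and sweeps one span; ok is false once every span is swept.
func sweepone() (pages uintptr, ok bool) {
	idx := atomic.AddUint32(&sweepIdx, 1) - 1
	if int(idx) >= len(sweepSpans) {
		return 0, false
	}
	// ... MSpan_Sweep would run here ...
	return sweepSpans[idx].pages, true
}

// bgsweep is the background sweeper: sweep, yield, repeat; park when done.
func bgsweep() {
	for {
		for {
			if _, ok := sweepone(); !ok {
				break
			}
			runtime.Gosched() // be nice to the rest of the program
		}
		<-wake // parked until the next GC publishes a new span list
	}
}

func main() {
	sweepSpans = []span{{pages: 1}, {pages: 2}, {pages: 4}}
	go bgsweep()
	runtime.Gosched()
	fmt.Println("spans handed to the background sweeper:", len(sweepSpans))
}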
diff --git a/src/pkg/runtime/mheap.c b/src/pkg/runtime/mheap.c
index ddbcc5f72f71a4993126a359c9358032988e265b..05cc80a3451dcfa0a006da9e2adb0dda8eedeb91 100644
@@ -41,7 +41,10 @@ RecordSpan(void *vh, byte *p)
                        runtime·throw("runtime: cannot allocate memory");
                if(h->allspans) {
                        runtime·memmove(all, h->allspans, h->nspancap*sizeof(all[0]));
-                       runtime·SysFree(h->allspans, h->nspancap*sizeof(all[0]), &mstats.other_sys);
+                       // Don't free the old array if it's referenced by sweep.
+                       // See the comment in mgc0.c.
+                       if(h->allspans != runtime·mheap.sweepspans)
+                               runtime·SysFree(h->allspans, h->nspancap*sizeof(all[0]), &mstats.other_sys);
                }
                h->allspans = all;
                h->nspancap = cap;
@@ -60,9 +63,12 @@ runtime·MHeap_Init(MHeap *h)
        runtime·FixAlloc_Init(&h->specialfinalizeralloc, sizeof(SpecialFinalizer), nil, nil, &mstats.other_sys);
        runtime·FixAlloc_Init(&h->specialprofilealloc, sizeof(SpecialProfile), nil, nil, &mstats.other_sys);
        // h->mapcache needs no init
-       for(i=0; i<nelem(h->free); i++)
+       for(i=0; i<nelem(h->free); i++) {
                runtime·MSpanList_Init(&h->free[i]);
-       runtime·MSpanList_Init(&h->large);
+               runtime·MSpanList_Init(&h->busy[i]);
+       }
+       runtime·MSpanList_Init(&h->freelarge);
+       runtime·MSpanList_Init(&h->busylarge);
        for(i=0; i<nelem(h->central); i++)
                runtime·MCentral_Init(&h->central[i], i);
 }
@@ -83,10 +89,86 @@ runtime·MHeap_MapSpans(MHeap *h)
        h->spans_mapped = n;
 }
 
+// Sweeps spans in list until reclaims at least npages into heap.
+// Returns the actual number of pages reclaimed.
+static uintptr
+MHeap_ReclaimList(MHeap *h, MSpan *list, uintptr npages)
+{
+       MSpan *s;
+       uintptr n;
+       uint32 sg;
+
+       n = 0;
+       sg = runtime·mheap.sweepgen;
+retry:
+       for(s = list->next; s != list; s = s->next) {
+               if(s->sweepgen == sg-2 && runtime·cas(&s->sweepgen, sg-2, sg-1)) {
+                       runtime·MSpanList_Remove(s);
+                       // swept spans are at the end of the list
+                       runtime·MSpanList_InsertBack(list, s);
+                       runtime·unlock(h);
+                       n += runtime·MSpan_Sweep(s);
+                       runtime·lock(h);
+                       if(n >= npages)
+                               return n;
+                       // the span could have been moved elsewhere
+                       goto retry;
+               }
+               if(s->sweepgen == sg-1) {
+               // the span is being swept by background sweeper, skip
+                       continue;
+               }
+               // already swept empty span,
+               // all subsequent ones must also be either swept or in process of sweeping
+               break;
+       }
+       return n;
+}
+
+// Sweeps and reclaims at least npage pages into heap.
+// Called before allocating npage pages.
+static void
+MHeap_Reclaim(MHeap *h, uintptr npage)
+{
+       uintptr reclaimed, n;
+
+       // First try to sweep busy spans with large objects of size >= npage,
+       // this has good chances of reclaiming the necessary space.
+       for(n=npage; n < nelem(h->busy); n++) {
+               if(MHeap_ReclaimList(h, &h->busy[n], npage))
+                       return;  // Bingo!
+       }
+
+       // Then -- even larger objects.
+       if(MHeap_ReclaimList(h, &h->busylarge, npage))
+               return;  // Bingo!
+
+       // Now try smaller objects.
+       // One such object is not enough, so we need to reclaim several of them.
+       reclaimed = 0;
+       for(n=0; n < npage && n < nelem(h->busy); n++) {
+               reclaimed += MHeap_ReclaimList(h, &h->busy[n], npage-reclaimed);
+               if(reclaimed >= npage)
+                       return;
+       }
+
+       // Now sweep everything that is not yet swept.
+       runtime·unlock(h);
+       for(;;) {
+               n = runtime·sweepone();
+               if(n == -1)  // all spans are swept
+                       break;
+               reclaimed += n;
+               if(reclaimed >= npage)
+                       break;
+       }
+       runtime·lock(h);
+}
+
 // Allocate a new span of npage pages from the heap
 // and record its size class in the HeapMap and HeapMapCache.
 MSpan*
-runtime·MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct, int32 zeroed)
+runtime·MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, bool large, bool zeroed)
 {
        MSpan *s;
 
@@ -96,9 +178,14 @@ runtime·MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct, int32
        s = MHeap_AllocLocked(h, npage, sizeclass);
        if(s != nil) {
                mstats.heap_inuse += npage<<PageShift;
-               if(acct) {
+               if(large) {
                        mstats.heap_objects++;
                        mstats.heap_alloc += npage<<PageShift;
+                       // Swept spans are at the end of lists.
+                       if(s->npages < nelem(h->free))
+                               runtime·MSpanList_InsertBack(&h->busy[s->npages], s);
+                       else
+                               runtime·MSpanList_InsertBack(&h->busylarge, s);
                }
        }
        runtime·unlock(h);
@@ -114,6 +201,11 @@ MHeap_AllocLocked(MHeap *h, uintptr npage, int32 sizeclass)
        MSpan *s, *t;
        PageID p;
 
+       // To prevent excessive heap growth, before allocating n pages
+       // we need to sweep and reclaim at least n pages.
+       if(!h->sweepdone)
+               MHeap_Reclaim(h, npage);
+
        // Try in fixed-size lists up to max.
        for(n=npage; n < nelem(h->free); n++) {
                if(!runtime·MSpanList_IsEmpty(&h->free[n])) {
@@ -137,6 +229,7 @@ HaveSpan:
        if(s->npages < npage)
                runtime·throw("MHeap_AllocLocked - bad npages");
        runtime·MSpanList_Remove(s);
+       runtime·atomicstore(&s->sweepgen, h->sweepgen);
        s->state = MSpanInUse;
        mstats.heap_idle -= s->npages<<PageShift;
        mstats.heap_released -= s->npreleased<<PageShift;
@@ -174,6 +267,7 @@ HaveSpan:
                h->spans[p] = t;
                h->spans[p+t->npages-1] = t;
                *(uintptr*)(t->start<<PageShift) = *(uintptr*)(s->start<<PageShift);  // copy "needs zeroing" mark
+               runtime·atomicstore(&t->sweepgen, h->sweepgen);
                t->state = MSpanInUse;
                MHeap_FreeLocked(h, t);
                t->unusedsince = s->unusedsince; // preserve age
@@ -196,7 +290,7 @@ HaveSpan:
 static MSpan*
 MHeap_AllocLarge(MHeap *h, uintptr npage)
 {
-       return BestFit(&h->large, npage, nil);
+       return BestFit(&h->freelarge, npage, nil);
 }
 
 // Search list for smallest span with >= npage pages.
@@ -257,6 +351,7 @@ MHeap_Grow(MHeap *h, uintptr npage)
        p -= ((uintptr)h->arena_start>>PageShift);
        h->spans[p] = s;
        h->spans[p + s->npages - 1] = s;
+       runtime·atomicstore(&s->sweepgen, h->sweepgen);
        s->state = MSpanInUse;
        MHeap_FreeLocked(h, s);
        return true;
@@ -324,8 +419,9 @@ MHeap_FreeLocked(MHeap *h, MSpan *s)
 
        s->types.compression = MTypes_Empty;
 
-       if(s->state != MSpanInUse || s->ref != 0) {
-               runtime·printf("MHeap_FreeLocked - span %p ptr %p state %d ref %d\n", s, s->start<<PageShift, s->state, s->ref);
+       if(s->state != MSpanInUse || s->ref != 0 || s->sweepgen != h->sweepgen) {
+               runtime·printf("MHeap_FreeLocked - span %p ptr %p state %d ref %d sweepgen %d/%d\n",
+                       s, s->start<<PageShift, s->state, s->ref, s->sweepgen, h->sweepgen);
                runtime·throw("MHeap_FreeLocked - invalid free");
        }
        mstats.heap_idle += s->npages<<PageShift;
@@ -371,7 +467,7 @@ MHeap_FreeLocked(MHeap *h, MSpan *s)
        if(s->npages < nelem(h->free))
                runtime·MSpanList_Insert(&h->free[s->npages], s);
        else
-               runtime·MSpanList_Insert(&h->large, s);
+               runtime·MSpanList_Insert(&h->freelarge, s);
 }
 
 static void
@@ -414,7 +510,7 @@ scavenge(int32 k, uint64 now, uint64 limit)
        sumreleased = 0;
        for(i=0; i < nelem(h->free); i++)
                sumreleased += scavengelist(&h->free[i], now, limit);
-       sumreleased += scavengelist(&h->large, now, limit);
+       sumreleased += scavengelist(&h->freelarge, now, limit);
 
        if(runtime·debug.gctrace > 0) {
                if(sumreleased > 0)
@@ -499,7 +595,7 @@ runtime·MSpan_Init(MSpan *span, PageID start, uintptr npages)
        span->ref = 0;
        span->sizeclass = 0;
        span->elemsize = 0;
-       span->state = 0;
+       span->state = MSpanDead;
        span->unusedsince = 0;
        span->npreleased = 0;
        span->types.compression = MTypes_Empty;
@@ -546,6 +642,19 @@ runtime·MSpanList_Insert(MSpan *list, MSpan *span)
        span->prev->next = span;
 }
 
+void
+runtime·MSpanList_InsertBack(MSpan *list, MSpan *span)
+{
+       if(span->next != nil || span->prev != nil) {
+               runtime·printf("failed MSpanList_InsertBack %p %p %p\n", span, span->next, span->prev);
+               runtime·throw("MSpanList_InsertBack");
+       }
+       span->next = list;
+       span->prev = list->prev;
+       span->next->prev = span;
+       span->prev->next = span;
+}
+
 // Adds the special record s to the list of special records for
 // the object p.  All fields of s should be filled in except for
 // offset & next, which this routine will fill in.
@@ -563,6 +672,11 @@ addspecial(void *p, Special *s)
        span = runtime·MHeap_LookupMaybe(&runtime·mheap, p);
        if(span == nil)
                runtime·throw("addspecial on invalid pointer");
+
+       // Ensure that the span is swept.
+       // GC accesses specials list w/o locks. And it's just much safer.
+       runtime·MSpan_EnsureSwept(span);
+
        offset = (uintptr)p - (span->start << PageShift);
        kind = s->kind;
 
@@ -600,6 +714,11 @@ removespecial(void *p, byte kind)
        span = runtime·MHeap_LookupMaybe(&runtime·mheap, p);
        if(span == nil)
                runtime·throw("removespecial on invalid pointer");
+
+       // Ensure that the span is swept.
+       // GC accesses specials list w/o locks. And it's just much safer.
+       runtime·MSpan_EnsureSwept(span);
+
        offset = (uintptr)p - (span->start << PageShift);
 
        runtime·lock(&span->specialLock);
@@ -675,7 +794,7 @@ runtime·setprofilebucket(void *p, Bucket *b)
 // already been unlinked from the MSpan specials list.
 // Returns true if we should keep working on deallocating p.
 bool
-runtime·freespecial(Special *s, void *p, uintptr size)
+runtime·freespecial(Special *s, void *p, uintptr size, bool freed)
 {
        SpecialFinalizer *sf;
        SpecialProfile *sp;
@@ -690,7 +809,7 @@ runtime·freespecial(Special *s, void *p, uintptr size)
                return false; // don't free p until finalizer is done
        case KindSpecialProfile:
                sp = (SpecialProfile*)s;
-               runtime·MProf_Free(sp->b, p, size);
+               runtime·MProf_Free(sp->b, p, size, freed);
                runtime·lock(&runtime·mheap.speciallock);
                runtime·FixAlloc_Free(&runtime·mheap.specialprofilealloc, sp);
                runtime·unlock(&runtime·mheap.speciallock);
@@ -729,7 +848,7 @@ runtime·freeallspecials(MSpan *span, void *p, uintptr size)
        while(list != nil) {
                s = list;
                list = s->next;
-               if(!runtime·freespecial(s, p, size))
+               if(!runtime·freespecial(s, p, size, true))
                        runtime·throw("can't explicitly free an object with a finalizer");
        }
 }
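
A toy model of the busy-list discipline used by MHeap_ReclaimList above (plain slices
instead of the runtime's doubly linked MSpan lists): freshly swept spans are moved to
the back of the list, so a reclaim scan can stop at the first span that is already swept.

package main

import "fmt"

type span struct {
	id    int
	swept bool
}

// reclaimList keeps unswept spans at the front and moves each span it sweeps
// to the back, stopping once npage pages are reclaimed or the head is already
// swept (everything behind it is swept too).
func reclaimList(list []*span, npage int) (int, []*span) {
	reclaimed := 0
	for len(list) > 0 && !list[0].swept && reclaimed < npage {
		s := list[0]
		s.swept = true             // "sweep" it (MSpan_Sweep)
		reclaimed++                // pretend each sweep frees one page
		list = append(list[1:], s) // swept spans go to the back of the list
	}
	return reclaimed, list
}

func main() {
	busy := []*span{{id: 1}, {id: 2}, {id: 3, swept: true}}
	n, busy := reclaimList(busy, 2)
	fmt.Println(n, busy[0].id) // 2 3: the already-swept span is now at the head
}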
diff --git a/src/pkg/runtime/mprof.goc b/src/pkg/runtime/mprof.goc
index 321a2801fd9ad20d2a809761966e3b7b38793395..6eaecc6c2aa499f0c8adbf05e5e1cf7be7a12b2b 100644
@@ -33,14 +33,33 @@ struct Bucket
        {
                struct  // typ == MProf
                {
+                       // The following complex 3-stage scheme of stats accumulation
+                       // is required to obtain a consistent picture of mallocs and frees
+                       // for some point in time.
+                       // The problem is that mallocs come in real time, while frees
+                       // come only after a GC during concurrent sweeping. So if we would
+                       // naively count them, we would get a skew toward mallocs.
+                       //
+                       // Mallocs are accounted in recent stats.
+                       // Explicit frees are accounted in recent stats.
+                       // GC frees are accounted in prev stats.
+                       // After GC prev stats are added to final stats and
+                       // recent stats are moved into prev stats.
                        uintptr allocs;
                        uintptr frees;
                        uintptr alloc_bytes;
                        uintptr free_bytes;
-                       uintptr recent_allocs;  // since last gc
+
+                       uintptr prev_allocs;  // since last but one till last gc
+                       uintptr prev_frees;
+                       uintptr prev_alloc_bytes;
+                       uintptr prev_free_bytes;
+
+                       uintptr recent_allocs;  // since last gc till now
                        uintptr recent_frees;
                        uintptr recent_alloc_bytes;
                        uintptr recent_free_bytes;
+
                };
                struct  // typ == BProf
                {
@@ -117,10 +136,16 @@ MProf_GC(void)
        Bucket *b;
 
        for(b=mbuckets; b; b=b->allnext) {
-               b->allocs += b->recent_allocs;
-               b->frees += b->recent_frees;
-               b->alloc_bytes += b->recent_alloc_bytes;
-               b->free_bytes += b->recent_free_bytes;
+               b->allocs += b->prev_allocs;
+               b->frees += b->prev_frees;
+               b->alloc_bytes += b->prev_alloc_bytes;
+               b->free_bytes += b->prev_free_bytes;
+
+               b->prev_allocs = b->recent_allocs;
+               b->prev_frees = b->recent_frees;
+               b->prev_alloc_bytes = b->recent_alloc_bytes;
+               b->prev_free_bytes = b->recent_free_bytes;
+
                b->recent_allocs = 0;
                b->recent_frees = 0;
                b->recent_alloc_bytes = 0;
@@ -220,11 +245,16 @@ runtime·MProf_Malloc(void *p, uintptr size, uintptr typ)
 
 // Called when freeing a profiled block.
 void
-runtime·MProf_Free(Bucket *b, void *p, uintptr size)
+runtime·MProf_Free(Bucket *b, void *p, uintptr size, bool freed)
 {
        runtime·lock(&proflock);
-       b->recent_frees++;
-       b->recent_free_bytes += size;
+       if(freed) {
+               b->recent_frees++;
+               b->recent_free_bytes += size;
+       } else {
+               b->prev_frees++;
+               b->prev_free_bytes += size;
+       }
        if(runtime·debug.allocfreetrace) {
                runtime·printf("MProf_Free(p=%p, size=%p)\n", p, size);
                printstackframes(b->stk, b->nstk);
@@ -318,6 +348,7 @@ func MemProfile(p Slice, include_inuse_zero bool) (n int, ok bool) {
                // garbage collection is disabled from the beginning of execution,
                // accumulate stats as if a GC just happened, and recount buckets.
                MProf_GC();
+               MProf_GC();
                n = 0;
                for(b=mbuckets; b; b=b->allnext)
                        if(include_inuse_zero || b->alloc_bytes != b->free_bytes)