From: Jan Ziak <0xe2.0x9a.0x9b@gmail.com>
Date: Mon, 17 Dec 2012 00:32:12 +0000 (-0500)
Subject: runtime: struct Obj in mgc0.c and buffers in scanblock()
X-Git-Tag: go1.1rc2~1617
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=013fa63c901e4a08548821bcc46393584b8c701e;p=gostls13.git

runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() now puts found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with skeleton code that in
  the future will be able to recognize the type of objects and thus
  improve the garbage collector's precision. For now, all objects are
  simply sequences of pointers, so the precision of the garbage
  collector remains unchanged.

- The code plugs the .gcdata and .gcbss sections into the garbage
  collector. scanblock() in this CL does not yet make any use of them.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121
---
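Illustrative sketch (not part of the patch): the standalone C program below
mimics the buffering scheme this CL introduces. Candidate pointers found
while scanning are first staged in a small fixed-size intermediate buffer
(the xbuf of the description; struct PtrTarget in the patch) and flushed to
the larger work buffer in batches, rather than being pushed one at a time.
All names and capacities here are hypothetical; the real flushptrbuf()
additionally locates block starts, manipulates mark bits under a lock, and
hands off work to other procs.

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    enum { IntermediateCap = 4, WorkCap = 64 };   /* tiny, for demonstration */

    struct Obj {          /* mirrors the patch: a block of memory to scan */
        uint8_t *p;       /* data pointer */
        uintptr_t n;      /* size of data in bytes */
        uintptr_t ti;     /* type info (0 = conservative) */
    };

    struct PtrTarget { void *p; uintptr_t ti; };  /* staged pointer target */

    static struct PtrTarget ptrbuf[IntermediateCap];
    static size_t ptrpos;

    static struct Obj work[WorkCap];              /* stand-in for Workbuf */
    static size_t nobj;

    /* flush: batch-move staged pointers into the work buffer.  The real
     * flushptrbuf also finds block starts, then checks and sets mark bits
     * (via the BitTarget buffer) before enqueueing. */
    static void flush(void)
    {
        size_t i;

        for(i = 0; i < ptrpos; i++) {
            if(nobj < WorkCap) {
                work[nobj].p = ptrbuf[i].p;
                work[nobj].n = sizeof(void*);  /* real code asks the span for elemsize */
                work[nobj].ti = ptrbuf[i].ti;
                nobj++;
            }
        }
        ptrpos = 0;
    }

    /* stage one candidate pointer, flushing when the small buffer fills */
    static void stage(void *p, uintptr_t ti)
    {
        ptrbuf[ptrpos].p = p;
        ptrbuf[ptrpos].ti = ti;
        if(++ptrpos == IntermediateCap)
            flush();
    }

    int main(void)
    {
        int i;
        void *block[8] = {0};

        /* treat every word of the block as a potential pointer, as the
         * patch's GC_DEFAULT_PTR instruction does */
        for(i = 0; i < 8; i++)
            stage(&block[i], 0);
        flush();   /* drain the remainder */

        printf("queued %zu objects\n", nobj);
        return 0;
    }

The point of the two-stage design is that the expensive per-pointer work
(block-start lookup, mark-bit updates) can be done over a whole batch at
once, which is what makes the later type-driven scanning practical.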
diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index 32e030c518..a60684168b 100644
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -8,16 +8,19 @@
 #include "arch_GOARCH.h"
 #include "malloc.h"
 #include "stack.h"
+#include "mgc0.h"
 #include "race.h"
 
 enum {
 	Debug = 0,
 	DebugMark = 0,  // run second pass to check mark
-	DataBlock = 8*1024,
 
 	// Four bits per word (see #defines below).
 	wordsPerBitmapWord = sizeof(void*)*8/4,
 	bitShift = sizeof(void*)*8/4,
+
+	handoffThreshold = 4,
+	IntermediateBufferCapacity = 64,
 };
 
 // Bits in per-word bitmap.
@@ -70,12 +73,24 @@ uint32 runtime·worldsema = 1;
 
 static int32 gctrace;
 
+typedef struct Obj Obj;
+struct Obj
+{
+	byte	*p;	// data pointer
+	uintptr	n;	// size of data in bytes
+	uintptr	ti;	// type info
+};
+
+// The size of Workbuf is N*PageSize.
 typedef struct Workbuf Workbuf;
 struct Workbuf
 {
-	LFNode node; // must be first
+#define SIZE (2*PageSize-sizeof(LFNode)-sizeof(uintptr))
+	LFNode	node; // must be first
 	uintptr nobj;
-	byte *obj[512-(sizeof(LFNode)+sizeof(uintptr))/sizeof(byte*)];
+	Obj	obj[SIZE/sizeof(Obj) - 1];
+	uint8	_padding[SIZE%sizeof(Obj) + sizeof(Obj)];
+#undef SIZE
 };
 
 typedef struct Finalizer Finalizer;
@@ -97,9 +112,13 @@ struct FinBlock
 };
 
 extern byte data[];
-extern byte etext[];
+extern byte edata[];
+extern byte bss[];
 extern byte ebss[];
 
+extern byte gcdata[];
+extern byte gcbss[];
+
 static G *fing;
 static FinBlock *finq; // list of finalizers that are to be executed
 static FinBlock *finc; // cache of free blocks
@@ -113,13 +132,6 @@ static Workbuf* getfull(Workbuf*);
 static void	putempty(Workbuf*);
 static Workbuf* handoff(Workbuf*);
 
-typedef struct GcRoot GcRoot;
-struct GcRoot
-{
-	byte *p;
-	uintptr n;
-};
-
 static struct {
 	uint64	full;  // lock-free list of full blocks
 	uint64	empty; // lock-free list of empty blocks
@@ -136,77 +148,122 @@ static struct {
 	byte	*chunk;
 	uintptr	nchunk;
 
-	GcRoot	*roots;
+	Obj	*roots;
 	uint32	nroot;
 	uint32	rootcap;
 } work;
 
-// scanblock scans a block of n bytes starting at pointer b for references
-// to other objects, scanning any it finds recursively until there are no
-// unscanned objects left. Instead of using an explicit recursion, it keeps
-// a work list in the Workbuf* structures and loops in the main function
-// body. Keeping an explicit work list is easier on the stack allocator and
-// more efficient.
+enum {
+	// TODO(atom): to be expanded in a next CL
+	GC_DEFAULT_PTR = GC_NUM_INSTR,
+};
+
+// PtrTarget and BitTarget are structures used by intermediate buffers.
+// The intermediate buffers hold GC data before it
+// is moved/flushed to the work buffer (Workbuf).
+// The size of an intermediate buffer is very small,
+// such as 32 or 64 elements.
+struct PtrTarget
+{
+	void *p;
+	uintptr ti;
+};
+
+struct BitTarget
+{
+	void *p;
+	uintptr ti;
+	uintptr *bitp, shift;
+};
+
+struct BufferList
+{
+	struct PtrTarget ptrtarget[IntermediateBufferCapacity];
+	struct BitTarget bittarget[IntermediateBufferCapacity];
+	struct BufferList *next;
+};
+static struct BufferList *bufferList;
+
+static Lock lock;
+
+// flushptrbuf moves data from the PtrTarget buffer to the work buffer.
+// The PtrTarget buffer contains blocks irrespective of whether the blocks have been marked or scanned,
+// while the work buffer contains blocks which have been marked
+// and are prepared to be scanned by the garbage collector.
+//
+// _wp, _wbuf, _nobj are input/output parameters and are specifying the work buffer.
+// bitbuf holds temporary data generated by this function.
+//
+// A simplified drawing explaining how the todo-list moves from a structure to another:
+//
+//     scanblock
+//  (find pointers)
+//    Obj ------> PtrTarget (pointer targets)
+//     ↑          |
+//     |          | flushptrbuf (1st part,
+//     |          | find block start)
+//     |          ↓
+//     `--------- BitTarget (pointer targets and the corresponding locations in bitmap)
+//  flushptrbuf
+//  (2nd part, mark and enqueue)
 static void
-scanblock(byte *b, uintptr n)
+flushptrbuf(struct PtrTarget *ptrbuf, uintptr n, Obj **_wp, Workbuf **_wbuf, uintptr *_nobj, struct BitTarget *bitbuf)
 {
-	byte *obj, *arena_start, *arena_used, *p;
-	void **vp;
-	uintptr size, *bitp, bits, shift, i, j, x, xbits, off, nobj, nproc;
+	byte *p, *arena_start, *obj;
+	uintptr size, *bitp, bits, shift, j, x, xbits, off, nobj, ti;
 	MSpan *s;
 	PageID k;
-	void **wp;
+	Obj *wp;
 	Workbuf *wbuf;
-	bool keepworking;
-
-	if((intptr)n < 0) {
-		runtime·printf("scanblock %p %D\n", b, (int64)n);
-		runtime·throw("scanblock");
-	}
+	struct PtrTarget *ptrbuf_end;
+	struct BitTarget *bitbufpos, *bt;
 
-	// Memory arena parameters.
 	arena_start = runtime·mheap.arena_start;
-	arena_used = runtime·mheap.arena_used;
-	nproc = work.nproc;
 
-	wbuf = nil;  // current work buffer
-	wp = nil;  // storage for next queued pointer (write pointer)
-	nobj = 0;  // number of queued objects
+	wp = *_wp;
+	wbuf = *_wbuf;
+	nobj = *_nobj;
 
-	// Scanblock helpers pass b==nil.
-	// Procs needs to return to make more
-	// calls to scanblock.  But if work.nproc==1 then
-	// might as well process blocks as soon as we
-	// have them.
-	keepworking = b == nil || work.nproc == 1;
+	ptrbuf_end = ptrbuf + n;
 
-	// Align b to a word boundary.
-	off = (uintptr)b & (PtrSize-1);
-	if(off != 0) {
-		b += PtrSize - off;
-		n -= PtrSize - off;
+	// If buffer is nearly full, get a new one.
+	if(wbuf == nil || nobj+n >= nelem(wbuf->obj)) {
+		if(wbuf != nil)
+			wbuf->nobj = nobj;
+		wbuf = getempty(wbuf);
+		wp = wbuf->obj;
+		nobj = 0;
+
+		if(n >= nelem(wbuf->obj))
+			runtime·throw("ptrbuf has to be smaller than WorkBuf");
 	}
 
-	for(;;) {
-		// Each iteration scans the block b of length n, queueing pointers in
-		// the work buffer.
-		if(Debug > 1)
-			runtime·printf("scanblock %p %D\n", b, (int64)n);
+	// TODO(atom): This block is a branch of an if-then-else statement.
+	// The single-threaded branch may be added in a next CL.
+	{
+		// Multi-threaded version.
+
+		bitbufpos = bitbuf;
+
-		vp = (void**)b;
-		n >>= (2+PtrSize/8);  /* n /= PtrSize (4 or 8) */
-		for(i=0; i<n; i++) {
-			obj = (byte*)vp[i];
-
-			// Words outside the arena cannot be pointers.
-			if((byte*)obj < arena_start || (byte*)obj >= arena_used)
-				continue;
+		while(ptrbuf < ptrbuf_end) {
+			obj = ptrbuf->p;
+			ti = ptrbuf->ti;
+			ptrbuf++;
+
+			// obj belongs to interval [mheap.arena_start, mheap.arena_used).
+			if(Debug > 1) {
+				if(obj < runtime·mheap.arena_start || obj >= runtime·mheap.arena_used)
+					runtime·throw("object is outside of mheap");
+			}
 
 			// obj may be a pointer to a live object.
 			// Try to find the beginning of the object.
 
 			// Round down to word boundary.
-			obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
+			if(((uintptr)obj & ((uintptr)PtrSize-1)) != 0) {
+				obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
+				ti = 0;
+			}
 
 			// Find bits for this word.
 			off = (uintptr*)obj - (uintptr*)arena_start;
@@ -219,6 +276,8 @@ scanblock(byte *b, uintptr n)
 			if((bits & (bitAllocated|bitBlockBoundary)) != 0)
 				goto found;
 
+			ti = 0;
+
 			// Pointing just past the beginning?
 			// Scan backward a little to find a block boundary.
 			for(j=shift; j-->0; ) {
@@ -239,13 +298,13 @@ scanblock(byte *b, uintptr n)
 			s = runtime·mheap.map[x];
 			if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse)
 				continue;
-			p = (byte*)((uintptr)s->start<<PageShift);
+			p = (byte*)((uintptr)s->start<<PageShift);
 			if(s->sizeclass == 0) {
 				obj = p;
 			} else {
 				if((byte*)obj >= (byte*)s->limit)
 					continue;
-				size = runtime·class_to_size[s->sizeclass];
+				size = s->elemsize;
 				int32 i = ((byte*)obj - p)/size;
 				obj = p+i*size;
 			}
@@ -258,81 +317,203 @@ scanblock(byte *b, uintptr n)
 			bits = xbits >> shift;
 
 		found:
-			// If another proc wants a pointer, give it some.
-			if(work.nwait > 0 && nobj > 4 && work.full == 0) {
-				wbuf->nobj = nobj;
-				wbuf = handoff(wbuf);
-				nobj = wbuf->nobj;
-				wp = wbuf->obj + nobj;
-			}
-
 			// Now we have bits, bitp, and shift correct for
 			// obj pointing at the base of the object.
 			// Only care about allocated and not marked.
 			if((bits & (bitAllocated|bitMarked)) != bitAllocated)
 				continue;
-			if(nproc == 1)
-				*bitp |= bitMarked<<shift;
-			else {
-				for(;;) {
-					x = *bitp;
-					if(x & (bitMarked<<shift))
-						goto continue_obj;
-					if(runtime·casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift))))
-						break;
-				}
-			}
+
+			*bitbufpos = (struct BitTarget){obj, ti, bitp, shift};
+			bitbufpos++;
+		}
+
+		runtime·lock(&lock);
+
+		for(bt=bitbuf; bt<bitbufpos; bt++){
+			xbits = *bt->bitp;
+			bits = xbits >> bt->shift;
+			if((bits & bitMarked) != 0)
+				continue;
+
+			// Mark the block
+			*bt->bitp = xbits | (bitMarked << bt->shift);
 
 			// If object has no pointers, don't need to scan further.
 			if((bits & bitNoPointers) != 0)
 				continue;
 
+			obj = bt->p;
+
+			// Ask span about size class.
+			// (Manually inlined copy of MHeap_Lookup.)
+			x = (uintptr)obj >> PageShift;
+			if(sizeof(void*) == 8)
+				x -= (uintptr)arena_start>>PageShift;
+			s = runtime·mheap.map[x];
+
 			PREFETCH(obj);
 
-			// If buffer is full, get a new one.
-			if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
-				if(wbuf != nil)
-					wbuf->nobj = nobj;
-				wbuf = getempty(wbuf);
-				wp = wbuf->obj;
-				nobj = 0;
-			}
-			*wp++ = obj;
+			*wp = (Obj){obj, s->elemsize, bt->ti};
+			wp++;
 			nobj++;
-		continue_obj:;
 		}
+		runtime·unlock(&lock);
+
+		// If another proc wants a pointer, give it some.
+		if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
+			wbuf->nobj = nobj;
+			wbuf = handoff(wbuf);
+			nobj = wbuf->nobj;
+			wp = wbuf->obj + nobj;
+		}
+	}
+
+	*_wp = wp;
+	*_wbuf = wbuf;
+	*_nobj = nobj;
+}
+
+// Program that scans the whole block and treats every block element as a potential pointer
+static uintptr defaultProg[2] = {PtrSize, GC_DEFAULT_PTR};
+
+// scanblock scans a block of n bytes starting at pointer b for references
+// to other objects, scanning any it finds recursively until there are no
+// unscanned objects left. Instead of using an explicit recursion, it keeps
+// a work list in the Workbuf* structures and loops in the main function
+// body. Keeping an explicit work list is easier on the stack allocator and
+// more efficient.
+//
+// wbuf: current work buffer
+// wp:   storage for next queued pointer (write pointer)
+// nobj: number of queued objects
+static void
+scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking)
+{
+	byte *b, *arena_start, *arena_used;
+	uintptr n, i, end_b;
+	void *obj;
+
+	// TODO(atom): to be expanded in a next CL
+	struct Frame {uintptr count, b; uintptr *loop_or_ret;};
+	struct Frame stack_top;
+
+	uintptr *pc;
+
+	struct BufferList *scanbuffers;
+	struct PtrTarget *ptrbuf, *ptrbuf_end;
+	struct BitTarget *bitbuf;
+
+	struct PtrTarget *ptrbufpos;
+
+	// End of local variable declarations.
+
+	if(sizeof(Workbuf) % PageSize != 0)
+		runtime·throw("scanblock: size of Workbuf is suboptimal");
+
+	// Memory arena parameters.
+	arena_start = runtime·mheap.arena_start;
+	arena_used = runtime·mheap.arena_used;
+
+	// Allocate ptrbuf, bitbuf
+	{
+		runtime·lock(&lock);
+
+		if(bufferList == nil) {
+			bufferList = runtime·SysAlloc(sizeof(*bufferList));
+			bufferList->next = nil;
+		}
+		scanbuffers = bufferList;
+		bufferList = bufferList->next;
+
+		ptrbuf = &scanbuffers->ptrtarget[0];
+		ptrbuf_end = &scanbuffers->ptrtarget[0] + nelem(scanbuffers->ptrtarget);
+		bitbuf = &scanbuffers->bittarget[0];
+
+		runtime·unlock(&lock);
+	}
+
+	ptrbufpos = ptrbuf;
+
+	goto next_block;
+
+	for(;;) {
+		// Each iteration scans the block b of length n, queueing pointers in
+		// the work buffer.
+		if(Debug > 1) {
+			runtime·printf("scanblock %p %D\n", b, (int64)n);
+		}
+
+		// TODO(atom): to be replaced in a next CL
+		pc = defaultProg;
+
+		pc++;
+		stack_top.b = (uintptr)b;
+
+		end_b = (uintptr)b + n - PtrSize;
+
+	next_instr:
+		// TODO(atom): to be expanded in a next CL
+		switch(pc[0]) {
+		case GC_DEFAULT_PTR:
+			while(true) {
+				i = stack_top.b;
+				if(i > end_b)
+					goto next_block;
+				stack_top.b += PtrSize;
+
+				obj = *(byte**)i;
+				if(obj >= arena_start && obj < arena_used) {
+					*ptrbufpos = (struct PtrTarget){obj, 0};
+					ptrbufpos++;
+					if(ptrbufpos == ptrbuf_end)
+						goto flush_buffers;
+				}
+			}
+
+		default:
+			runtime·throw("scanblock: invalid GC instruction");
+			return;
+		}
+
+	flush_buffers:
+		flushptrbuf(ptrbuf, ptrbufpos-ptrbuf, &wp, &wbuf, &nobj, bitbuf);
+		ptrbufpos = ptrbuf;
+		goto next_instr;
+
+	next_block:
 		// Done scanning [b, b+n).  Prepare for the next iteration of
-		// the loop by setting b and n to the parameters for the next block.
+		// the loop by setting b, n to the parameters for the next block.
 
-		// Fetch b from the work buffer.
 		if(nobj == 0) {
-			if(!keepworking) {
-				if(wbuf)
-					putempty(wbuf);
-				return;
+			flushptrbuf(ptrbuf, ptrbufpos-ptrbuf, &wp, &wbuf, &nobj, bitbuf);
+			ptrbufpos = ptrbuf;
+
+			if(nobj == 0) {
+				if(!keepworking) {
+					if(wbuf)
+						putempty(wbuf);
+					goto endscan;
+				}
+				// Emptied our buffer: refill.
+				wbuf = getfull(wbuf);
+				if(wbuf == nil)
+					goto endscan;
+				nobj = wbuf->nobj;
+				wp = wbuf->obj + wbuf->nobj;
 			}
-			// Emptied our buffer: refill.
-			wbuf = getfull(wbuf);
-			if(wbuf == nil)
-				return;
-			nobj = wbuf->nobj;
-			wp = wbuf->obj + wbuf->nobj;
 		}
-		b = *--wp;
-		nobj--;
 
-		// Ask span about size class.
-		// (Manually inlined copy of MHeap_Lookup.)
-		x = (uintptr)b>>PageShift;
-		if(sizeof(void*) == 8)
-			x -= (uintptr)arena_start>>PageShift;
-		s = runtime·mheap.map[x];
-		if(s->sizeclass == 0)
-			n = s->npages<<PageShift;
-		else
-			n = runtime·class_to_size[s->sizeclass];
+		// Fetch b from the work buffer.
+		--wp;
+		b = wp->p;
+		n = wp->n;
+		nobj--;
 	}
+
+endscan:
+	runtime·lock(&lock);
+	scanbuffers->next = bufferList;
+	bufferList = scanbuffers;
+	runtime·unlock(&lock);
 }
 
 // debug_scanblock is the debug copy of scanblock.
@@ -379,13 +560,12 @@ debug_scanblock(byte *b, uintptr n)
 			continue;
 
 		p = (byte*)((uintptr)s->start<<PageShift);
+		size = s->elemsize;
 		if(s->sizeclass == 0) {
 			obj = p;
-			size = (uintptr)s->npages<<PageShift;
 		} else {
 			if((byte*)obj >= (byte*)s->limit)
 				continue;
-			size = runtime·class_to_size[s->sizeclass];
 			int32 i = ((byte*)obj - p)/size;
 			obj = p+i*size;
 		}
@@ -414,11 +594,74 @@ debug_scanblock(byte *b, uintptr n)
 	}
 }
 
+// Append obj to the work buffer.
+// _wbuf, _wp, _nobj are input/output parameters and are specifying the work buffer.
+static void
+enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj)
+{
+	uintptr nobj, off;
+	Obj *wp;
+	Workbuf *wbuf;
+
+	if(Debug > 1)
+		runtime·printf("append obj(%p %D %p)\n", obj.p, (int64)obj.n, obj.ti);
+
+	// Align obj.b to a word boundary.
+	off = (uintptr)obj.p & (PtrSize-1);
+	if(off != 0) {
+		obj.p += PtrSize - off;
+		obj.n -= PtrSize - off;
+		obj.ti = 0;
+	}
+
+	if(obj.p == nil || obj.n == 0)
+		return;
+
+	// Load work buffer state
+	wp = *_wp;
+	wbuf = *_wbuf;
+	nobj = *_nobj;
+
+	// If another proc wants a pointer, give it some.
+	if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
+		wbuf->nobj = nobj;
+		wbuf = handoff(wbuf);
+		nobj = wbuf->nobj;
+		wp = wbuf->obj + nobj;
+	}
+
+	// If buffer is full, get a new one.
+	if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
+		if(wbuf != nil)
+			wbuf->nobj = nobj;
+		wbuf = getempty(wbuf);
+		wp = wbuf->obj;
+		nobj = 0;
+	}
+
+	*wp = obj;
+	wp++;
+	nobj++;
+
+	// Save work buffer state
+	*_wp = wp;
+	*_wbuf = wbuf;
+	*_nobj = nobj;
+}
+
 static void
 markroot(ParFor *desc, uint32 i)
 {
+	Obj *wp;
+	Workbuf *wbuf;
+	uintptr nobj;
+
 	USED(&desc);
-	scanblock(work.roots[i].p, work.roots[i].n);
+	wp = nil;
+	wbuf = nil;
+	nobj = 0;
+	enqueue(work.roots[i], &wbuf, &wp, &nobj);
+	scanblock(wbuf, wp, nobj, false);
 }
 
 // Get an empty work buffer off the work.empty list,
@@ -508,25 +751,24 @@ handoff(Workbuf *b)
 }
 
 static void
-addroot(byte *p, uintptr n)
+addroot(Obj obj)
 {
 	uint32 cap;
-	GcRoot *new;
+	Obj *new;
 
 	if(work.nroot >= work.rootcap) {
-		cap = PageSize/sizeof(GcRoot);
+		cap = PageSize/sizeof(Obj);
 		if(cap < 2*work.rootcap)
 			cap = 2*work.rootcap;
-		new = (GcRoot*)runtime·SysAlloc(cap*sizeof(GcRoot));
+		new = (Obj*)runtime·SysAlloc(cap*sizeof(Obj));
 		if(work.roots != nil) {
-			runtime·memmove(new, work.roots, work.rootcap*sizeof(GcRoot));
-			runtime·SysFree(work.roots, work.rootcap*sizeof(GcRoot));
+			runtime·memmove(new, work.roots, work.rootcap*sizeof(Obj));
+			runtime·SysFree(work.roots, work.rootcap*sizeof(Obj));
 		}
 		work.roots = new;
 		work.rootcap = cap;
 	}
-	work.roots[work.nroot].p = p;
-	work.roots[work.nroot].n = n;
+	work.roots[work.nroot] = obj;
 	work.nroot++;
 }
 
@@ -570,7 +812,7 @@ addstackroots(G *gp)
 			runtime·printf("scanstack inconsistent: g%D#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk);
 			runtime·throw("scanstack");
 		}
-		addroot(sp, (byte*)stk - sp);
+		addroot((Obj){sp, (byte*)stk - sp, 0});
 		sp = (byte*)stk->gobuf.sp;
 		guard = stk->stackguard;
 		stk = (Stktop*)stk->stackbase;
@@ -588,7 +830,7 @@ addfinroots(void *v)
 		runtime·throw("mark - finalizer inconsistency");
 
 	// do not mark the finalizer block itself.  just mark the things it points at.
-	addroot(v, size);
+	addroot((Obj){v, size, 0});
 }
 
 static void
@@ -596,15 +838,15 @@ addroots(void)
 {
 	G *gp;
 	FinBlock *fb;
-	byte *p;
 	MSpan *s, **allspans;
 	uint32 spanidx;
 
 	work.nroot = 0;
 
-	// mark data+bss.
-	for(p=data; p<ebss; p+=DataBlock)
-		addroot(p, p+DataBlock < ebss ? DataBlock : ebss-p);
+	// data & bss
+	addroot((Obj){data, edata - data, (uintptr)gcdata});
+	addroot((Obj){bss, ebss - bss, (uintptr)gcbss});
 
 	allspans = runtime·mheap.allspans;
 	for(spanidx=0; spanidx<runtime·mheap.nspan; spanidx++) {
 		s = allspans[spanidx];
 		if(s->state == MSpanInUse) {
 			switch(s->types.compression) {
 			case MTypes_Empty:
 			case MTypes_Single:
 				break;
 			case MTypes_Words:
 			case MTypes_Bytes:
-				addroot((byte*)&s->types.data, sizeof(void*));
+				// TODO(atom): consider using defaultProg instead of 0
+				addroot((Obj){(byte*)&s->types.data, sizeof(void*), 0});
 				break;
 			}
 		}
 	}
 
+	// stacks
 	for(gp=runtime·allg; gp!=nil; gp=gp->alllink) {
 		switch(gp->status){
 		default:
@@ -646,7 +890,7 @@ addroots(void)
 	runtime·walkfintab(addfinroots);
 
 	for(fb=allfin; fb; fb=fb->alllink)
-		addroot((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
+		addroot((Obj){(byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]), 0});
 }
 
 static bool
@@ -887,8 +1131,9 @@ runtime·gchelper(void)
 {
 	// parallel mark for over gc roots
 	runtime·parfordo(work.markfor);
+
 	// help other threads scan secondary blocks
-	scanblock(nil, 0);
+	scanblock(nil, nil, 0, true);
 
 	if(DebugMark) {
 		// wait while the main thread executes mark(debug_scanblock)
@@ -1050,26 +1295,27 @@ gc(struct gc_args *args)
 		obj0 = mstats.nmalloc - mstats.nfree;
 	}
 
+	m->locks++;	// disable gc during mallocs in parforalloc
+	if(work.markfor == nil)
+		work.markfor = runtime·parforalloc(MaxGcproc);
+	if(work.sweepfor == nil)
+		work.sweepfor = runtime·parforalloc(MaxGcproc);
+	m->locks--;
+
 	work.nwait = 0;
 	work.ndone = 0;
 	work.debugmarkdone = 0;
 	work.nproc = runtime·gcprocs();
 	addroots();
-	m->locks++;	// disable gc during mallocs in parforalloc
-	if(work.markfor == nil)
-		work.markfor = runtime·parforalloc(MaxGcproc);
 	runtime·parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot);
-	if(work.sweepfor == nil)
-		work.sweepfor = runtime·parforalloc(MaxGcproc);
 	runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap.nspan, nil, true, sweepspan);
-	m->locks--;
 	if(work.nproc > 1) {
 		runtime·noteclear(&work.alldone);
 		runtime·helpgc(work.nproc);
	}
 
 	runtime·parfordo(work.markfor);
-	scanblock(nil, 0);
+	scanblock(nil, nil, 0, true);
 
 	if(DebugMark) {
 		for(i=0; i
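Illustrative sketch (not part of the patch): the rewritten scanblock() is
structured as a small interpreter whose program counter pc walks an array
of GC instructions. This CL ships a single one-instruction program,
defaultProg = {PtrSize, GC_DEFAULT_PTR}, which treats every word of a block
as a potential pointer; later CLs can add type-aware instructions. A
standalone caricature of that dispatch loop, with hypothetical opcode names,
could look like this:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    /* hypothetical opcodes; the patch defines only GC_DEFAULT_PTR and
     * reserves the rest for a future CL */
    enum { SCAN_DEFAULT_PTR = 1, SCAN_END = 2 };

    /* program layout mimics defaultProg: element 0 is the stride,
     * the rest are instructions */
    static uintptr_t default_prog[] = { sizeof(void*), SCAN_DEFAULT_PTR, SCAN_END };

    /* interpret prog over the block [b, b+n) */
    static void scan(uint8_t *b, size_t n, const uintptr_t *prog)
    {
        uintptr_t stride = prog[0];
        const uintptr_t *pc = prog + 1;
        size_t off;

        for(;;) {
            switch(*pc) {
            case SCAN_DEFAULT_PTR:
                /* treat every aligned word as a candidate pointer; the
                 * real code stages these in the PtrTarget buffer and
                 * filters them against the heap arena bounds */
                for(off = 0; off + sizeof(void*) <= n; off += stride)
                    printf("candidate %p\n", *(void**)(b + off));
                pc++;
                break;
            case SCAN_END:
                return;
            default:
                return;  /* invalid GC instruction */
            }
        }
    }

    int main(void)
    {
        void *block[4];

        block[0] = &block[1];
        block[1] = NULL;
        block[2] = &block[3];
        block[3] = NULL;

        scan((uint8_t*)block, sizeof block, default_prog);
        return 0;
    }

Because the interpreter state (stack_top, pc) lives outside the work list,
the same loop can later resume partially scanned objects, which is what the
Frame struct in the patch is reserved for.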