Cypherpunks repositories - gostls13.git/commitdiff
runtime: parallelize garbage collector mark + sweep
author Russ Cox <rsc@golang.org>
Fri, 30 Sep 2011 13:40:01 +0000 (09:40 -0400)
committer Russ Cox <rsc@golang.org>
Fri, 30 Sep 2011 13:40:01 +0000 (09:40 -0400)
Running test/garbage/parser.out.

On a 4-core Lenovo X201s (Linux):
31.12u 0.60s 31.74r   1 cpu, no atomics
32.27u 0.58s 32.86r   1 cpu, atomic instructions
33.04u 0.83s 27.47r   2 cpu

On a 16-core Xeon (Linux):
33.08u 0.65s 33.80r   1 cpu, no atomics
34.87u 1.12s 29.60r   2 cpu
36.00u 1.87s 28.43r   3 cpu
36.46u 2.34s 27.10r   4 cpu
38.28u 3.85s 26.92r   5 cpu
37.72u 5.25s 26.73r  6 cpu
39.63u 7.11s 26.95r  7 cpu
39.67u 8.10s 26.68r  8 cpu

On a 2-core MacBook Pro Core 2 Duo 2.26 (circa 2009, MacBookPro5,5):
39.43u 1.45s 41.27r   1 cpu, no atomics
43.98u 2.95s 38.69r   2 cpu

On a 2-core Mac Mini Core 2 Duo 1.83 (circa 2008; Macmini2,1):
48.81u 2.12s 51.76r   1 cpu, no atomics
57.15u 4.72s 51.54r   2 cpu

The handoff algorithm is really only good for two cores.
Beyond that we will need to do something more sophisticated,
like have each core hand off to the next one, around a circle.
Even so, the code is a good checkpoint; for now we'll limit the
number of gc procs to at most 2.

R=dvyukov
CC=golang-dev
https://golang.org/cl/4641082

15 files changed:
src/pkg/runtime/darwin/386/sys.s
src/pkg/runtime/darwin/amd64/sys.s
src/pkg/runtime/darwin/os.h
src/pkg/runtime/darwin/thread.c
src/pkg/runtime/linux/386/sys.s
src/pkg/runtime/linux/amd64/sys.s
src/pkg/runtime/linux/arm/sys.s
src/pkg/runtime/linux/thread.c
src/pkg/runtime/malloc.h
src/pkg/runtime/mgc0.c
src/pkg/runtime/print.c
src/pkg/runtime/proc.c
src/pkg/runtime/runtime.h
test/garbage/Makefile
test/garbage/parser.go

index 87fbdbb79ec249e603c7f60f2ed31aa53a5d0103..15eaf93bc3e8a722265dd94576f2db5581c88134 100644 (file)
@@ -97,7 +97,7 @@ TEXT runtime·sigtramp(SB),7,$40
        // save g
        MOVL    g(CX), DI
        MOVL    DI, 20(SP)
-       
+
        // g = m->gsignal
        MOVL    m(CX), BP
        MOVL    m_gsignal(BP), BP
@@ -111,7 +111,7 @@ TEXT runtime·sigtramp(SB),7,$40
        MOVL    context+16(FP), BX
        MOVL    BX, 8(SP)
        MOVL    DI, 12(SP)
-       
+
        MOVL    handler+0(FP), BX
        CALL    BX
 
@@ -138,6 +138,26 @@ TEXT runtime·sigaltstack(SB),7,$0
        CALL    runtime·notok(SB)
        RET
 
+TEXT runtime·usleep(SB),7,$32  // block for usec microseconds using select with only a timeout
+       MOVL    $0, DX  // zero the high half of the DX:AX dividend
+       MOVL    usec+0(FP), AX
+       MOVL    $1000000, CX
+       DIVL    CX  // AX = usec / 1000000, DX = usec % 1000000
+       MOVL    AX, 24(SP)  // tv sec field
+       MOVL    DX, 28(SP)  // tv usec field
+
+       // select(0, 0, 0, 0, &tv): arguments are laid out on the stack
+       MOVL    $0, 0(SP)  // "return PC" slot - ignored
+       MOVL    $0, 4(SP)  // 1st argument = 0
+       MOVL    $0, 8(SP)  // 2nd argument = 0
+       MOVL    $0, 12(SP)  // 3rd argument = 0
+       MOVL    $0, 16(SP)  // 4th argument = 0
+       LEAL    24(SP), AX
+       MOVL    AX, 20(SP)  // 5th argument = &tv
+       MOVL    $93, AX  // syscall number (select, per the comment above)
+       INT     $0x80
+       RET  // the syscall result is not examined
+
 // void bsdthread_create(void *stk, M *m, G *g, void (*fn)(void))
 // System call args are: func arg stack pthread flags.
 TEXT runtime·bsdthread_create(SB),7,$32
@@ -309,3 +329,12 @@ TEXT runtime·setldt(SB),7,$32
        XORL    AX, AX
        MOVW    GS, AX
        RET
+
+TEXT runtime·sysctl(SB),7,$0  // frame size 0: the caller's argument words are used in place
+       MOVL    $202, AX  // syscall number (presumably sysctl, per the function name)
+       INT     $0x80
+       JAE     3(PC)  // carry clear: skip ahead to the "MOVL $0, AX" path
+       NEGL    AX  // carry set: return the negated AX (assumed to hold an errno - confirm)
+       RET
+       MOVL    $0, AX  // carry clear: return 0
+       RET
index 8d1b20f11899e8ed0cc254b757ec8bd94efd86dd..7c79f18c49aa6929509d326a67d3951700783d8c 100644 (file)
@@ -81,11 +81,11 @@ TEXT runtime·sigaction(SB),7,$0
 
 TEXT runtime·sigtramp(SB),7,$64
        get_tls(BX)
-       
+
        // save g
        MOVQ    g(BX), R10
        MOVQ    R10, 48(SP)
-       
+
        // g = m->gsignal
        MOVQ    m(BX), BP
        MOVQ    m_gsignal(BP), BP
@@ -146,6 +146,24 @@ TEXT runtime·sigaltstack(SB),7,$0
        CALL    runtime·notok(SB)
        RET
 
+TEXT runtime·usleep(SB),7,$16  // block for usec microseconds using select with only a timeout
+       MOVL    $0, DX  // zero the high half of the DX:AX dividend
+       MOVL    usec+0(FP), AX
+       MOVL    $1000000, CX
+       DIVL    CX  // AX = usec / 1000000, DX = usec % 1000000
+       MOVQ    AX, 0(SP)  // sec
+       MOVL    DX, 8(SP)  // usec
+
+       // select(0, 0, 0, 0, &tv)
+       MOVL    $0, DI
+       MOVL    $0, SI
+       MOVL    $0, DX
+       MOVL    $0, R10
+       MOVQ    SP, R8  // &tv: the timeval sits at 0(SP)
+       MOVL    $(0x2000000+93), AX  // select is 93 on Darwin (23 is the Linux/amd64 number)
+       SYSCALL
+       RET  // the syscall result is not examined
+
 // void bsdthread_create(void *stk, M *m, G *g, void (*fn)(void))
 TEXT runtime·bsdthread_create(SB),7,$0
        // Set up arguments to bsdthread_create system call.
@@ -189,7 +207,7 @@ TEXT runtime·bsdthread_start(SB),7,$0
        POPQ    SI
        POPQ    CX
        POPQ    DX
-       
+
        get_tls(BX)
        MOVQ    CX, m(BX)
        MOVQ    SI, m_procid(CX)        // thread port is m->procid
@@ -293,3 +311,18 @@ TEXT runtime·settls(SB),7,$32
        MOVL    $(0x3000000+3), AX      // thread_fast_set_cthread_self - machdep call #3
        SYSCALL
        RET
+
+TEXT runtime·sysctl(SB),7,$0  // load the six caller arguments into the syscall registers
+       MOVQ    8(SP), DI  // 1st argument
+       MOVL    16(SP), SI  // 2nd argument (loaded as 32 bits)
+       MOVQ    24(SP), DX  // 3rd argument
+       MOVQ    32(SP), R10  // 4th argument
+       MOVQ    40(SP), R8  // 5th argument
+       MOVQ    48(SP), R9  // 6th argument
+       MOVL    $(0x2000000+202), AX    // syscall entry
+       SYSCALL
+       JCC 3(PC)  // carry clear: skip ahead to the "MOVL $0, AX" path
+       NEGL    AX  // carry set: return the negated AX (assumed to hold an errno - confirm)
+       RET
+       MOVL    $0, AX  // carry clear: return 0
+       RET
index db3c2e8a7c6fc9a802fce8899bca6a7f880a93bd..37160f779ce82a68d3c82f6528f26cb9bd9bea4a 100644 (file)
@@ -18,6 +18,7 @@ uint32        runtime·mach_task_self(void);
 uint32 runtime·mach_task_self(void);
 uint32 runtime·mach_thread_self(void);
 uint32 runtime·mach_thread_self(void);
+int32  runtime·sysctl(uint32*, uint32, byte*, uintptr*, byte*, uintptr);
 
 struct Sigaction;
 void   runtime·sigaction(uintptr, struct Sigaction*, struct Sigaction*);
index 6733e815e8163426043fe7773b1c4af28416bfb6..c5d8ba4d3e6621c1bec025cfa2652d03552f9070 100644 (file)
@@ -148,6 +148,20 @@ runtime·osinit(void)
        if(!runtime·iscgo)
                runtime·bsdthread_register();
        runtime·destroylock = destroylock;
+
+       // Use sysctl to fetch hw.ncpu.
+       uint32 mib[2];
+       uint32 out;
+       int32 ret;
+       uintptr nout;
+
+       mib[0] = 6;
+       mib[1] = 3;
+       nout = sizeof out;
+       out = 0;
+       ret = runtime·sysctl(mib, 2, (byte*)&out, &nout, nil, 0);
+       if(ret >= 0)
+               runtime·ncpu = out;
 }
 
 void
index f87420f788c6dd2657ae9edd60cbeeb231e17442..7549c04798ca93b77c6f09244b1d834a8b8c80ea 100644 (file)
@@ -52,6 +52,25 @@ TEXT runtime·read(SB),7,$0
        CALL    *runtime·_vdso(SB)
        RET
 
+TEXT runtime·usleep(SB),7,$28  // block for usec microseconds using select with only a timeout
+       MOVL    $0, DX  // zero the high half of the DX:AX dividend
+       MOVL    usec+0(FP), AX
+       MOVL    $1000000, CX
+       DIVL    CX  // AX = usec / 1000000, DX = usec % 1000000
+       MOVL    AX, 20(SP)  // tv sec field
+       MOVL    DX, 24(SP)  // tv usec field
+
+       // select(0, 0, 0, 0, &tv); Linux/386 takes syscall arguments in BX, CX, DX, SI, DI.
+       MOVL    $142, AX  // syscall - _newselect (82 is the old one-struct-argument select)
+       MOVL    $0, BX
+       MOVL    $0, CX
+       MOVL    $0, DX
+       MOVL    $0, SI
+       LEAL    20(SP), DI  // &tv
+       CALL    *runtime·_vdso(SB)  // same kernel entry the other syscalls in this file use
+       RET  // the syscall result is not examined
+
 TEXT runtime·raisesigpipe(SB),7,$12
        MOVL    $224, AX        // syscall - gettid
        CALL    *runtime·_vdso(SB)
@@ -105,16 +124,16 @@ TEXT runtime·rt_sigaction(SB),7,$0
 
 TEXT runtime·sigtramp(SB),7,$44
        get_tls(CX)
-       
+
        // save g
        MOVL    g(CX), DI
        MOVL    DI, 20(SP)
-       
+
        // g = m->gsignal
        MOVL    m(CX), BX
        MOVL    m_gsignal(BX), BX
        MOVL    BX, g(CX)
-       
+
        // copy arguments for call to sighandler
        MOVL    sig+0(FP), BX
        MOVL    BX, 0(SP)
@@ -125,12 +144,12 @@ TEXT runtime·sigtramp(SB),7,$44
        MOVL    DI, 12(SP)
 
        CALL    runtime·sighandler(SB)
-       
+
        // restore g
        get_tls(CX)
        MOVL    20(SP), BX
        MOVL    BX, g(CX)
-       
+
        RET
 
 TEXT runtime·sigignore(SB),7,$0
@@ -202,7 +221,7 @@ TEXT runtime·clone(SB),7,$0
        MOVL    $1234, 12(CX)
 
        // cannot use CALL *runtime·_vdso(SB) here, because
-       // the stack changes during the system call (after 
+       // the stack changes during the system call (after
        // CALL *runtime·_vdso(SB), the child is still using
        // the parent's stack when executing its RET instruction).
        INT     $0x80
index 8b4dcd921e8fdb010dc9bb9416bf50a6b9ebda89..3174af2cb0599f23bfe3ae609202f414e41365b8 100644 (file)
@@ -50,6 +50,24 @@ TEXT runtime·read(SB),7,$0-24
        SYSCALL
        RET
 
+TEXT runtime·usleep(SB),7,$16  // block for usec microseconds using select with only a timeout
+       MOVL    $0, DX  // zero the high half of the DX:AX dividend
+       MOVL    usec+0(FP), AX
+       MOVL    $1000000, CX
+       DIVL    CX  // AX = usec / 1000000, DX = usec % 1000000
+       MOVQ    AX, 0(SP)  // tv sec field
+       MOVQ    DX, 8(SP)  // tv usec field
+
+       // select(0, 0, 0, 0, &tv)
+       MOVL    $0, DI
+       MOVL    $0, SI
+       MOVL    $0, DX
+       MOVL    $0, R10
+       MOVQ    SP, R8  // &tv: the timeval sits at 0(SP)
+       MOVL    $23, AX  // syscall number (select, per the comment above)
+       SYSCALL
+       RET  // the syscall result is not examined
 TEXT runtime·raisesigpipe(SB),7,$12
        MOVL    $186, AX        // syscall - gettid
        SYSCALL
@@ -195,10 +213,10 @@ TEXT runtime·clone(SB),7,$0
        CMPQ    AX, $0
        JEQ     2(PC)
        RET
-       
+
        // In child, on new stack.
        MOVQ    SI, SP
-       
+
        // Initialize m->procid to Linux tid
        MOVL    $186, AX        // gettid
        SYSCALL
index 8619f0945caa99194f88f714443a753c5b2f21fc..764e779fdd3e448e7ddea7c9eb30504ecf9062a1 100644 (file)
@@ -33,6 +33,7 @@
 #define SYS_gettid (SYS_BASE + 224)
 #define SYS_tkill (SYS_BASE + 238)
 #define SYS_sched_yield (SYS_BASE + 158)
+#define SYS_select (SYS_BASE + 82)
 
 #define ARM_BASE (SYS_BASE + 0x0f0000)
 #define SYS_ARM_cacheflush (ARM_BASE + 2)
@@ -254,7 +255,7 @@ TEXT runtime·sigtramp(SB),7,$24
        // save g
        MOVW    g, R3
        MOVW    g, 20(R13)
-       
+
        // g = m->gsignal
        MOVW    m_gsignal(m), g
 
@@ -265,7 +266,7 @@ TEXT runtime·sigtramp(SB),7,$24
        MOVW    R3, 16(R13)
 
        BL      runtime·sighandler(SB)
-       
+
        // restore g
        MOVW    20(R13), g
 
@@ -285,6 +286,23 @@ TEXT runtime·sigreturn(SB),7,$0
        SWI     $0
        RET
 
+TEXT runtime·usleep(SB),7,$12  // block for usec microseconds using select with only a timeout
+       MOVW    usec+0(FP), R0
+       MOVW    R0, R1  // second copy of usec, used for the remainder
+       MOVW    $1000000, R2
+       DIV     R2, R0  // R0 = usec / 1000000
+       MOD     R2, R1  // R1 = usec % 1000000
+       MOVW    R0, 4(SP)  // tv sec field
+       MOVW    R1, 8(SP)  // tv usec field
+       MOVW    $0, R0  // select(0, 0, 0, 0, &tv)
+       MOVW    $0, R1
+       MOVW    $0, R2
+       MOVW    $0, R3
+       MOVW    $4(SP), R4  // &tv
+       MOVW    $SYS_select, R7
+       SWI     $0
+       RET  // the syscall result is not examined
+
 // Use kernel version instead of native armcas in ../../arm.s.
 // See ../../../sync/atomic/asm_linux_arm.s for details.
 TEXT cas<>(SB),7,$0
index 4878a00f25eb6d35bfa22a86abe828d3066aef18..bf3b0947d66a74f4dc74cc16a0970560db359c9f 100644 (file)
@@ -8,7 +8,6 @@
 #include "stack.h"
 
 extern SigTab runtime·sigtab[];
-static int32 proccount;
 
 int32 runtime·open(uint8*, int32, int32);
 int32 runtime·close(int32);
@@ -136,13 +135,10 @@ futexlock(Lock *l)
        // its wakeup call.
        wait = v;
 
-       if(proccount == 0)
-               proccount = getproccount();
-
        // On uniprocessor's, no point spinning.
        // On multiprocessors, spin for ACTIVE_SPIN attempts.
        spin = 0;
-       if(proccount > 1)
+       if(runtime·ncpu > 1)
                spin = ACTIVE_SPIN;
 
        for(;;) {
@@ -276,6 +272,7 @@ runtime·newosproc(M *m, G *g, void *stk, void (*fn)(void))
 void
 runtime·osinit(void)
 {
+       runtime·ncpu = getproccount();
 }
 
 void
index 5bc80f4df954b6c56271e76f8829ab0c28cd57da..f22cae4b0512a746259c04970991916bba37fa86 100644 (file)
@@ -120,6 +120,13 @@ enum
 #else
        MHeapMap_Bits = 20,
 #endif
+
+       // Max number of threads to run garbage collection.
+       // 2, 3, and 4 are all plausible maximums depending
+       // on the hardware details of the machine.  The second
+       // proc is the one that helps the most (after the first),
+       // so start with just 2 for now.
+       MaxGcproc = 2,
 };
 
 // A generic linked list of blocks.  (Typically the block is bigger than sizeof(MLink).)
@@ -192,7 +199,7 @@ struct MStats
        uint64  nlookup;        // number of pointer lookups
        uint64  nmalloc;        // number of mallocs
        uint64  nfree;  // number of frees
-       
+
        // Statistics about malloc heap.
        // protected by mheap.Lock
        uint64  heap_alloc;     // bytes allocated and still in use
@@ -210,7 +217,7 @@ struct MStats
        uint64  mcache_inuse;   // MCache structures
        uint64  mcache_sys;
        uint64  buckhash_sys;   // profiling bucket hash table
-       
+
        // Statistics about garbage collector.
        // Protected by stopping the world during GC.
        uint64  next_gc;        // next GC (in heap_alloc time)
@@ -219,7 +226,7 @@ struct MStats
        uint32  numgc;
        bool    enablegc;
        bool    debuggc;
-       
+
        // Statistics about allocation size classes.
        struct {
                uint32 size;
@@ -240,7 +247,7 @@ extern MStats mstats;
 //
 // class_to_size[i] = largest size in class i
 // class_to_allocnpages[i] = number of pages to allocate when
-//     making new objects in class i
+//     making new objects in class i
 // class_to_transfercount[i] = number of objects to move when
 //     taking a bunch of objects out of the central lists
 //     and putting them in the thread free list.
@@ -279,7 +286,7 @@ struct MCache
                int64 nmalloc;
                int64 nfree;
        } local_by_size[NumSizeClasses];
-       
+
 };
 
 void*  runtime·MCache_Alloc(MCache *c, int32 sizeclass, uintptr size, int32 zeroed);
@@ -352,7 +359,7 @@ struct MHeap
        byte *arena_start;
        byte *arena_used;
        byte *arena_end;
-       
+
        // central free lists for small size classes.
        // the union makes sure that the MCentrals are
        // spaced 64 bytes apart, so that each MCentral.Lock
@@ -400,6 +407,8 @@ enum
 
 void   runtime·MProf_Malloc(void*, uintptr);
 void   runtime·MProf_Free(void*, uintptr);
+int32  runtime·helpgc(void);
+void   runtime·gchelper(void);
 
 // Malloc profiling settings.
 // Must match definition in extern.go.
index 03d6f7d62951212d891ab9e5d7bb4064190be72f..eaa056da0b3f2f36eabf68d1c92280fbef9a11b4 100644 (file)
@@ -10,9 +10,9 @@
 
 enum {
        Debug = 0,
-       UseCas = 1,
        PtrSize = sizeof(void*),
-       
+       DebugMark = 0,  // run second pass to check mark
+
        // Four bits per word (see #defines below).
        wordsPerBitmapWord = sizeof(void*)*8/4,
        bitShift = sizeof(void*)*8/4,
@@ -51,17 +51,20 @@ enum {
 
 #define bitMask (bitBlockBoundary | bitAllocated | bitMarked | bitSpecial)
 
+// TODO: Make these per-M.
 static uint64 nlookup;
 static uint64 nsizelookup;
 static uint64 naddrlookup;
+static uint64 nhandoff;
+
 static int32 gctrace;
 
 typedef struct Workbuf Workbuf;
 struct Workbuf
 {
        Workbuf *next;
-       uintptr nw;
-       byte *w[2048-2];
+       uintptr nobj;
+       byte *obj[512-2];
 };
 
 extern byte data[];
@@ -75,6 +78,26 @@ static int32 fingwait;
 static void runfinq(void);
 static Workbuf* getempty(Workbuf*);
 static Workbuf* getfull(Workbuf*);
+static void    putempty(Workbuf*);
+static Workbuf* handoff(Workbuf*);
+
+static struct {
+       Lock fmu;
+       Workbuf *full;
+       Lock emu;
+       Workbuf *empty;
+       uint32  nproc;
+       volatile uint32 nwait;
+       volatile uint32 ndone;
+       Note    alldone;
+       Lock    markgate;
+       Lock    sweepgate;
+       MSpan   *spans;
+
+       Lock;
+       byte    *chunk;
+       uintptr nchunk;
+} work;
 
 // scanblock scans a block of n bytes starting at pointer b for references
 // to other objects, scanning any it finds recursively until there are no
@@ -85,13 +108,14 @@ static Workbuf* getfull(Workbuf*);
 static void
 scanblock(byte *b, int64 n)
 {
-       byte *obj, *arena_start, *p;
+       byte *obj, *arena_start, *arena_used, *p;
        void **vp;
-       uintptr size, *bitp, bits, shift, i, j, x, xbits, off;
+       uintptr size, *bitp, bits, shift, i, j, x, xbits, off, nobj, nproc;
        MSpan *s;
        PageID k;
-       void **bw, **w, **ew;
+       void **wp;
        Workbuf *wbuf;
+       bool keepworking;
 
        if((int64)(uintptr)n != n || n < 0) {
                runtime·printf("scanblock %p %D\n", b, n);
@@ -100,11 +124,19 @@ scanblock(byte *b, int64 n)
 
        // Memory arena parameters.
        arena_start = runtime·mheap.arena_start;
-       
+       arena_used = runtime·mheap.arena_used;
+       nproc = work.nproc;
+
        wbuf = nil;  // current work buffer
-       ew = nil;  // end of work buffer
-       bw = nil;  // beginning of work buffer
-       w = nil;  // current pointer into work buffer
+       wp = nil;  // storage for next queued pointer (write pointer)
+       nobj = 0;  // number of queued objects
+
+       // Scanblock helpers pass b==nil.
+       // The main proc needs to return to make more
+       // calls to scanblock.  But if work.nproc==1 then
+       // might as well process blocks as soon as we
+       // have them.
+       keepworking = b == nil || work.nproc == 1;
 
        // Align b to a word boundary.
        off = (uintptr)b & (PtrSize-1);
@@ -120,17 +152,17 @@ scanblock(byte *b, int64 n)
                        runtime·printf("scanblock %p %D\n", b, n);
 
                vp = (void**)b;
-               n /= PtrSize;
+               n >>= (2+PtrSize/8);  /* n /= PtrSize (4 or 8) */
                for(i=0; i<n; i++) {
                        obj = (byte*)vp[i];
-                       
+
                        // Words outside the arena cannot be pointers.
-                       if((byte*)obj < arena_start || (byte*)obj >= runtime·mheap.arena_used)
+                       if((byte*)obj < arena_start || (byte*)obj >= arena_used)
                                continue;
-                       
+
                        // obj may be a pointer to a live object.
                        // Try to find the beginning of the object.
-                       
+
                        // Round down to word boundary.
                        obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
 
@@ -188,47 +220,72 @@ scanblock(byte *b, int64 n)
                found:
                        // Now we have bits, bitp, and shift correct for
                        // obj pointing at the base of the object.
-                       // If not allocated or already marked, done.
-                       if((bits & bitAllocated) == 0 || (bits & bitMarked) != 0)
+                       // Only care about allocated and not marked.
+                       if((bits & (bitAllocated|bitMarked)) != bitAllocated)
                                continue;
-                       *bitp |= bitMarked<<shift;
+                       if(nproc == 1)
+                               *bitp |= bitMarked<<shift;
+                       else {
+                               for(;;) {
+                                       x = *bitp;
+                                       if(x & (bitMarked<<shift))
+                                               goto continue_obj;
+                                       if(runtime·casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift))))
+                                               break;
+                               }
+                       }
 
                        // If object has no pointers, don't need to scan further.
                        if((bits & bitNoPointers) != 0)
                                continue;
 
+                       // If another proc wants a pointer, give it some.
+                       if(nobj > 4 && work.nwait > 0 && work.full == nil) {
+                               wbuf->nobj = nobj;
+                               wbuf = handoff(wbuf);
+                               nobj = wbuf->nobj;
+                               wp = wbuf->obj + nobj;
+                       }
+
                        // If buffer is full, get a new one.
-                       if(w >= ew) {
+                       if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
+                               if(wbuf != nil)
+                                       wbuf->nobj = nobj;
                                wbuf = getempty(wbuf);
-                               bw = wbuf->w;
-                               w = bw;
-                               ew = bw + nelem(wbuf->w);
+                               wp = wbuf->obj;
+                               nobj = 0;
                        }
-                       *w++ = obj;
+                       *wp++ = obj;
+                       nobj++;
+               continue_obj:;
                }
-               
+
                // Done scanning [b, b+n).  Prepare for the next iteration of
                // the loop by setting b and n to the parameters for the next block.
 
-               // Fetch b from the work buffers.
-               if(w <= bw) {
+               // Fetch b from the work buffer.
+               if(nobj == 0) {
+                       if(!keepworking) {
+                               putempty(wbuf);
+                               return;
+                       }
                        // Emptied our buffer: refill.
                        wbuf = getfull(wbuf);
                        if(wbuf == nil)
-                               break;
-                       bw = wbuf->w;
-                       ew = wbuf->w + nelem(wbuf->w);
-                       w = bw+wbuf->nw;
+                               return;
+                       nobj = wbuf->nobj;
+                       wp = wbuf->obj + wbuf->nobj;
                }
-               b = *--w;
-       
+               b = *--wp;
+               nobj--;
+
                // Figure out n = size of b.  Start by loading bits for b.
                off = (uintptr*)b - (uintptr*)arena_start;
                bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
                shift = off % wordsPerBitmapWord;
                xbits = *bitp;
                bits = xbits >> shift;
-               
+
                // Might be small; look for nearby block boundary.
                // A block boundary is marked by either bitBlockBoundary
                // or bitAllocated being set (see notes near their definition).
@@ -247,12 +304,12 @@ scanblock(byte *b, int64 n)
                // apply a mask to keep only the bits corresponding
                // to shift+j < bitShift aka j < bitShift-shift.
                bits &= (boundary<<(bitShift-shift)) - boundary;
-               
+
                // A block boundary j words before b is indicated by
                //      xbits>>(shift-j) & boundary
                // (assuming shift >= j).  There is no cleverness here
                // avoid the test, because when j gets too large the shift
-               // turns negative, which is undefined in C.             
+               // turns negative, which is undefined in C.
 
                for(j=1; j<bitShift; j++) {
                        if(((bits>>j)&boundary) != 0 || shift>=j && ((xbits>>(shift-j))&boundary) != 0) {
@@ -260,7 +317,7 @@ scanblock(byte *b, int64 n)
                                goto scan;
                        }
                }
-               
+
                // Fall back to asking span about size class.
                // (Manually inlined copy of MHeap_Lookup.)
                nlookup++;
@@ -277,29 +334,123 @@ scanblock(byte *b, int64 n)
        }
 }
 
-static struct {
-       Workbuf *full;
-       Workbuf *empty;
-       byte    *chunk;
-       uintptr nchunk;
-} work;
+// debug_scanblock is the debug copy of scanblock: simpler, slower,
+// single-threaded and recursive. It uses bitSpecial as the mark bit and
+// prints any allocated block it reaches that does not have bitMarked set.
+static void
+debug_scanblock(byte *b, int64 n)
+{
+       byte *obj, *p;
+       void **vp;
+       uintptr size, *bitp, bits, shift, i, xbits, off;
+       MSpan *s;
+
+       if(!DebugMark)
+               runtime·throw("debug_scanblock without DebugMark");
+
+       if((int64)(uintptr)n != n || n < 0) {  // n must be non-negative and fit in a uintptr
+               runtime·printf("debug_scanblock %p %D\n", b, n);
+               runtime·throw("debug_scanblock");
+       }
+
+       // Align b to a word boundary.
+       off = (uintptr)b & (PtrSize-1);
+       if(off != 0) {
+               b += PtrSize - off;
+               n -= PtrSize - off;
+       }
+
+       vp = (void**)b;
+       n /= PtrSize;  // n is now a word count
+       for(i=0; i<n; i++) {
+               obj = (byte*)vp[i];
+
+               // Words outside the arena cannot be pointers.
+               if((byte*)obj < runtime·mheap.arena_start || (byte*)obj >= runtime·mheap.arena_used)
+                       continue;
+
+               // Round down to word boundary.
+               obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
+
+               // Consult span table to find beginning.
+               s = runtime·MHeap_LookupMaybe(&runtime·mheap, obj);
+               if(s == nil)
+                       continue;
+
+
+               p =  (byte*)((uintptr)s->start<<PageShift);  // address of the span's first byte
+               if(s->sizeclass == 0) {  // size class 0: the object is taken to be the whole span
+                       obj = p;
+                       size = (uintptr)s->npages<<PageShift;
+               } else {
+                       if((byte*)obj >= (byte*)s->limit)
+                               continue;
+                       size = runtime·class_to_size[s->sizeclass];
+                       int32 i = ((byte*)obj - p)/size;  // NOTE: shadows the loop index, within this else block only
+                       obj = p+i*size;  // start of the size-class slot containing obj
+               }
+
+               // Now that we know the object header, reload bits.
+               off = (uintptr*)obj - (uintptr*)runtime·mheap.arena_start;
+               bitp = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
+               shift = off % wordsPerBitmapWord;
+               xbits = *bitp;
+               bits = xbits >> shift;
+
+               // Now we have bits, bitp, and shift correct for
+               // obj pointing at the base of the object.
+               // If not allocated or already marked, done.
+               if((bits & bitAllocated) == 0 || (bits & bitSpecial) != 0)  // NOTE: bitSpecial not bitMarked
+                       continue;
+               *bitp |= bitSpecial<<shift;  // plain (non-atomic) store of the debug mark
+               if(!(bits & bitMarked))
+                       runtime·printf("found unmarked block %p in %p\n", obj, vp+i);
+
+               // If object has no pointers, don't need to scan further.
+               if((bits & bitNoPointers) != 0)
+                       continue;
+
+               debug_scanblock(obj, size);  // recurse into the object just marked
+       }
+}
 
 // Get an empty work buffer off the work.empty list,
 // allocating new buffers as needed.
 static Workbuf*
 getempty(Workbuf *b)
 {
-       if(b != nil) {
-               b->nw = nelem(b->w);
-               b->next = work.full;
-               work.full = b;
-       }
-       b = work.empty;
-       if(b != nil) {
-               work.empty = b->next;
-               return b;
+       if(work.nproc == 1) {
+               // Put b on full list.
+               if(b != nil) {
+                       b->next = work.full;
+                       work.full = b;
+               }
+               // Grab from empty list if possible.
+               b = work.empty;
+               if(b != nil) {
+                       work.empty = b->next;
+                       goto haveb;
+               }
+       } else {
+               // Put b on full list.
+               if(b != nil) {
+                       runtime·lock(&work.fmu);
+                       b->next = work.full;
+                       work.full = b;
+                       runtime·unlock(&work.fmu);
+               }
+               // Grab from empty list if possible.
+               runtime·lock(&work.emu);
+               b = work.empty;
+               if(b != nil)
+                       work.empty = b->next;
+               runtime·unlock(&work.emu);
+               if(b != nil)
+                       goto haveb;
        }
-       
+
+       // Need to allocate.
+       runtime·lock(&work);
        if(work.nchunk < sizeof *b) {
                work.nchunk = 1<<20;
                work.chunk = runtime·SysAlloc(work.nchunk);
@@ -307,27 +458,122 @@ getempty(Workbuf *b)
        b = (Workbuf*)work.chunk;
        work.chunk += sizeof *b;
        work.nchunk -= sizeof *b;
+       runtime·unlock(&work);
+
+haveb:
+       b->nobj = 0;
        return b;
 }
 
+static void
+putempty(Workbuf *b)  // push b onto the work.empty list; a nil b is a no-op
+{
+       if(b == nil)
+               return;
+       // With a single proc the list is updated without taking work.emu.
+       if(work.nproc == 1) {
+               b->next = work.empty;
+               work.empty = b;
+               return;
+       }
+       // Otherwise the update is done while holding work.emu.
+       runtime·lock(&work.emu);
+       b->next = work.empty;
+       work.empty = b;  // b becomes the new head (assigning b->next would leave the list unchanged and drop b)
+       runtime·unlock(&work.emu);
+}
+
 // Get a full work buffer off the work.full list, or return nil.
 static Workbuf*
 getfull(Workbuf *b)
 {
-       if(b != nil) {
-               b->nw = 0;
-               b->next = work.empty;
-               work.empty = b;
+       int32 i;
+       Workbuf *b1;
+
+       if(work.nproc == 1) {
+               // Put b on empty list.
+               if(b != nil) {
+                       b->next = work.empty;
+                       work.empty = b;
+               }
+               // Grab from full list if possible.
+               // Since work.nproc==1, no one else is
+               // going to give us work.
+               b = work.full;
+               if(b != nil)
+                       work.full = b->next;
+               return b;
+       }
+
+       putempty(b);
+
+       // Grab buffer from full list if possible.
+       for(;;) {
+               b1 = work.full;
+               if(b1 == nil)
+                       break;
+               runtime·lock(&work.fmu);
+               if(work.full != nil) {
+                       b1 = work.full;
+                       work.full = b1->next;
+                       runtime·unlock(&work.fmu);
+                       return b1;
+               }
+               runtime·unlock(&work.fmu);
+       }
+
+       runtime·xadd(&work.nwait, +1);
+       for(i=0;; i++) {
+               b1 = work.full;
+               if(b1 != nil) {
+                       runtime·lock(&work.fmu);
+                       if(work.full != nil) {
+                               runtime·xadd(&work.nwait, -1);
+                               b1 = work.full;
+                               work.full = b1->next;
+                               runtime·unlock(&work.fmu);
+                               return b1;
+                       }
+                       runtime·unlock(&work.fmu);
+                       continue;
+               }
+               if(work.nwait == work.nproc)
+                       return nil;
+               if(i < 10)
+                       runtime·procyield(20);
+               else if(i < 20)
+                       runtime·osyield();
+               else
+                       runtime·usleep(100);
        }
-       b = work.full;
-       if(b != nil)
-               work.full = b->next;
-       return b;
+}
+
+static Workbuf*
+handoff(Workbuf *b)  // split b's objects between b and a fresh buffer; queue b, return the fresh one
+{
+       int32 n;
+       Workbuf *b1;
+
+       // Make new buffer b1 holding the last n = nobj/2 pointers of b.
+       b1 = getempty(nil);  // nil: only fetch a buffer, nothing is pushed on the full list
+       n = b->nobj/2;
+       b->nobj -= n;  // b keeps the first nobj-n entries
+       b1->nobj = n;
+       runtime·memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]);
+       nhandoff += n;  // statistics counter; updated without a lock or atomic
+
+       // Put b on full list - let first half of b get stolen.
+       runtime·lock(&work.fmu);
+       b->next = work.full;
+       work.full = b;
+       runtime·unlock(&work.fmu);
+
+       return b1;  // returned buffer holds the n entries moved out of b
+}
 
 // Scanstack calls scanblock on each of gp's stack segments.
 static void
-scanstack(G *gp)
+scanstack(void (*scanblock)(byte*, int64), G *gp)
 {
        int32 n;
        Stktop *stk;
@@ -339,6 +585,9 @@ scanstack(G *gp)
        if(gp == g) {
                // Scanning our own stack: start at &gp.
                sp = (byte*)&gp;
+       } else if(gp->m != nil && gp->m->helpgc) {
+               // Gc helper scans its own stack.
+               return;
        } else {
                // Scanning another goroutine's stack.
                // The goroutine is usually asleep (the world is stopped).
@@ -387,17 +636,27 @@ markfin(void *v)
        scanblock(v, size);
 }
 
-// Mark 
+// debug_markfin is the debug_scanblock counterpart of markfin:
+// mark passes it to runtime·walkfintab when scanning with
+// debug_scanblock.  It looks v up with runtime·mlookup (which
+// presumably rewrites v to the block base and fills in size -
+// confirm against mlookup) and scans that block.
 static void
-mark(void)
+debug_markfin(void *v)
+{
+       uintptr size;
+
+       if(!runtime·mlookup(v, &v, &size, nil))
+               runtime·throw("debug_mark - finalizer inconsistency");
+       debug_scanblock(v, size);
+}
+
+// Mark
+static void
+mark(void (*scan)(byte*, int64))
 {
        G *gp;
 
        // mark data+bss.
        // skip runtime·mheap itself, which has no interesting pointers
        // and is mostly zeroed and would not otherwise be paged in.
-       scanblock(data, (byte*)&runtime·mheap - data);
-       scanblock((byte*)(&runtime·mheap+1), end - (byte*)(&runtime·mheap+1));
+       scan(data, (byte*)&runtime·mheap - data);
+       scan((byte*)(&runtime·mheap+1), end - (byte*)(&runtime·mheap+1));
 
        // mark stacks
        for(gp=runtime·allg; gp!=nil; gp=gp->alllink) {
@@ -410,18 +669,24 @@ mark(void)
                case Grunning:
                        if(gp != g)
                                runtime·throw("mark - world not stopped");
-                       scanstack(gp);
+                       scanstack(scan, gp);
                        break;
                case Grunnable:
                case Gsyscall:
                case Gwaiting:
-                       scanstack(gp);
+                       scanstack(scan, gp);
                        break;
                }
        }
 
        // mark things pointed at by objects with finalizers
-       runtime·walkfintab(markfin);
+       if(scan == debug_scanblock)
+               runtime·walkfintab(debug_markfin);
+       else
+               runtime·walkfintab(markfin);
+
+       // in multiproc mode, join in the queued work.
+       scan(nil, 0);
 }
 
 // Sweep frees or calls finalizers for blocks not marked in the mark phase.
@@ -435,8 +700,17 @@ sweep(void)
        byte *p;
        MCache *c;
        Finalizer *f;
+       byte *arena_start;
+
+       arena_start = runtime·mheap.arena_start;
+
+       for(;;) {
+               s = work.spans;
+               if(s == nil)
+                       break;
+               if(!runtime·casp(&work.spans, s, s->allnext))
+                       continue;
 
-       for(s = runtime·mheap.allspans; s != nil; s = s->allnext) {
                if(s->state != MSpanInUse)
                        continue;
 
@@ -451,13 +725,15 @@ sweep(void)
                        npages = runtime·class_to_allocnpages[cl];
                        n = (npages << PageShift) / size;
                }
-       
-               // sweep through n objects of given size starting at p.
+
+               // Sweep through n objects of given size starting at p.
+               // This thread owns the span now, so it can manipulate
+               // the block bitmap without atomic operations.
                for(; n > 0; n--, p += size) {
                        uintptr off, *bitp, shift, bits;
 
-                       off = (uintptr*)p - (uintptr*)runtime·mheap.arena_start;
-                       bitp = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
+                       off = (uintptr*)p - (uintptr*)arena_start;
+                       bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
                        shift = off % wordsPerBitmapWord;
                        bits = *bitp>>shift;
 
@@ -465,17 +741,27 @@ sweep(void)
                                continue;
 
                        if((bits & bitMarked) != 0) {
+                               if(DebugMark) {
+                                       if(!(bits & bitSpecial))
+                                               runtime·printf("found spurious mark on %p\n", p);
+                                       *bitp &= ~(bitSpecial<<shift);
+                               }
                                *bitp &= ~(bitMarked<<shift);
                                continue;
                        }
 
-                       if((bits & bitSpecial) != 0) {
+                       if(DebugMark || (bits & bitSpecial) != 0) {
                                // Special means it has a finalizer or is being profiled.
+                               // In DebugMark mode, the bit has been coopted so
+                               // we have to assume all blocks are special.
                                f = runtime·getfinalizer(p, 1);
                                if(f != nil) {
                                        f->arg = p;
-                                       f->next = finq;
-                                       finq = f;
+                                       for(;;) {
+                                               f->next = finq;
+                                               if(runtime·casp(&finq, f->next, f))
+                                                       break;
+                                       }
                                        continue;
                                }
                                runtime·MProf_Free(p, size);
@@ -503,6 +789,23 @@ sweep(void)
        }
 }
 
+// runtime·gchelper is called from nextgandunlock by an m whose
+// helpgc flag is set.  It joins the mark phase and then the sweep
+// phase of the collection in progress, and reports when it is done.
+void
+runtime·gchelper(void)
+{
+       // Wait until main proc is ready for mark help.
+       // runtime·gc holds markgate locked while it sets up work.nproc,
+       // work.nwait and work.ndone, so acquiring and immediately
+       // releasing the lock acts as a barrier.
+       runtime·lock(&work.markgate);
+       runtime·unlock(&work.markgate);
+       // No block of our own to scan: just join in the queued work.
+       scanblock(nil, 0);
+
+       // Wait until main proc is ready for sweep help.
+       // runtime·gc unlocks sweepgate only after setting work.spans.
+       runtime·lock(&work.sweepgate);
+       runtime·unlock(&work.sweepgate);
+       sweep();
+
+       // work.nproc counts the main proc too, so the helper that
+       // brings ndone to nproc-1 is the last one to finish (assuming
+       // xadd returns the updated value).  It wakes runtime·gc, which
+       // sleeps on alldone when nproc > 1.
+       if(runtime·xadd(&work.ndone, +1) == work.nproc-1)
+               runtime·notewakeup(&work.alldone);
+}
+
 // Semaphore, not Lock, so that the goroutine
 // reschedules when there is contention rather
 // than spinning.
@@ -523,7 +826,7 @@ static void
 stealcache(void)
 {
        M *m;
-       
+
        for(m=runtime·allm; m; m=m->alllink)
                runtime·MCache_ReleaseAll(m->mcache);
 }
@@ -562,6 +865,7 @@ runtime·gc(int32 force)
        uint64 heap0, heap1, obj0, obj1;
        byte *p;
        Finalizer *fp;
+       bool extra;
 
        // The gc is turned off (via enablegc) until
        // the bootstrap has completed.
@@ -582,7 +886,7 @@ runtime·gc(int32 force)
                        gcpercent = -1;
                else
                        gcpercent = runtime·atoi(p);
-               
+
                p = runtime·getenv("GOGCTRACE");
                if(p != nil)
                        gctrace = runtime·atoi(p);
@@ -600,6 +904,7 @@ runtime·gc(int32 force)
        nlookup = 0;
        nsizelookup = 0;
        naddrlookup = 0;
+       nhandoff = 0;
 
        m->gcing = 1;
        runtime·stoptheworld();
@@ -608,10 +913,30 @@ runtime·gc(int32 force)
        heap0 = mstats.heap_alloc;
        obj0 = mstats.nmalloc - mstats.nfree;
 
-       mark();
+       runtime·lock(&work.markgate);
+       runtime·lock(&work.sweepgate);
+
+       work.nproc = 1;
+       if(runtime·gomaxprocs > 1 && runtime·ncpu > 1) {
+               runtime·noteclear(&work.alldone);
+               work.nproc += runtime·helpgc();
+       }
+       work.nwait = 0;
+       work.ndone = 0;
+
+       runtime·unlock(&work.markgate);  // let the helpers in
+       mark(scanblock);
+       if(DebugMark)
+               mark(debug_scanblock);
        t1 = runtime·nanotime();
+
+       work.spans = runtime·mheap.allspans;
+       runtime·unlock(&work.sweepgate);  // let the helpers in
        sweep();
+       if(work.nproc > 1)
+               runtime·notesleep(&work.alldone);
        t2 = runtime·nanotime();
+
        stealcache();
        cachestats();
 
@@ -641,22 +966,32 @@ runtime·gc(int32 force)
        mstats.numgc++;
        if(mstats.debuggc)
                runtime·printf("pause %D\n", t3-t0);
-       
+
        if(gctrace) {
-               runtime·printf("gc%d: %D+%D+%D ms %D -> %D MB %D -> %D (%D-%D) objects %D pointer lookups (%D size, %D addr)\n",
-                       mstats.numgc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000,
+               runtime·printf("gc%d(%d): %D+%D+%D ms %D -> %D MB %D -> %D (%D-%D) objects %D pointer lookups (%D size, %D addr) %D handoff\n",
+                       mstats.numgc, work.nproc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000,
                        heap0>>20, heap1>>20, obj0, obj1,
                        mstats.nmalloc, mstats.nfree,
-                       nlookup, nsizelookup, naddrlookup);
+                       nlookup, nsizelookup, naddrlookup, nhandoff);
        }
 
        runtime·semrelease(&gcsema);
-       runtime·starttheworld();
-       
+
+       // If we could have used another helper proc, start one now,
+       // in the hope that it will be available next time.
+       // It would have been even better to start it before the collection,
+       // but doing so requires allocating memory, so it's tricky to
+       // coordinate.  This lazy approach works out in practice:
+       // we don't mind if the first couple gc rounds don't have quite
+       // the maximum number of procs.
+       extra = work.nproc < runtime·gomaxprocs && work.nproc < MaxGcproc;
+
+       runtime·starttheworld(extra);
+
        // give the queued finalizers, if any, a chance to run
        if(fp != nil)
                runtime·gosched();
-       
+
        if(gctrace > 1 && !force)
                runtime·gc(1);
 }
@@ -674,7 +1009,7 @@ runtime·UpdateMemStats(void)
        cachestats();
        m->gcing = 0;
        runtime·semrelease(&gcsema);
-       runtime·starttheworld();
+       runtime·starttheworld(0);
 }
 
 static void
@@ -858,6 +1193,9 @@ runtime·blockspecial(void *v)
 {
        uintptr *b, off, shift;
 
+       if(DebugMark)
+               return true;
+
        off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;
        b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
        shift = off % wordsPerBitmapWord;
@@ -870,6 +1208,9 @@ runtime·setblockspecial(void *v)
 {
        uintptr *b, off, shift, bits, obits;
 
+       if(DebugMark)
+               return;
+
        off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start;
        b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
        shift = off % wordsPerBitmapWord;
@@ -887,7 +1228,7 @@ runtime·setblockspecial(void *v)
                }
        }
 }
+
 void
 runtime·MHeap_MapBits(MHeap *h)
 {
@@ -898,7 +1239,7 @@ runtime·MHeap_MapBits(MHeap *h)
                bitmapChunk = 8192
        };
        uintptr n;
-       
+
        n = (h->arena_used - h->arena_start) / wordsPerBitmapWord;
        n = (n+bitmapChunk-1) & ~(bitmapChunk-1);
        if(h->bitmap_mapped >= n)
index 3ce7794957dc17b868f39be8ba72635ffdd22ae5..0d8caaf91271565c41db58eac026cbade4246817 100644 (file)
@@ -51,7 +51,7 @@ vprintf(int8 *s, byte *base)
        uintptr arg, narg;
        byte *v;
 
-//     lock(&debuglock);
+       //runtime·lock(&debuglock);
 
        lp = p = s;
        arg = 0;
@@ -152,7 +152,7 @@ vprintf(int8 *s, byte *base)
        if(p > lp)
                runtime·write(2, lp, p-lp);
 
-//     unlock(&debuglock);
+       //runtime·unlock(&debuglock);
 }
 
 #pragma textflag 7
@@ -348,4 +348,4 @@ runtime·typestring(Eface e, String s)
        s = *e.type->string;
        FLUSH(&s);
 }
-       
+
index cc075741d1be4d85c1e208fca94e5f70100f1695..39e3fa02304e898062439ffc3c9c295571bb20cf 100644 (file)
@@ -15,6 +15,7 @@ static void unwindstack(G*, byte*);
 static void schedule(G*);
 static void acquireproc(void);
 static void releaseproc(void);
+static M *startm(void);
 
 typedef struct Sched Sched;
 
@@ -323,6 +324,9 @@ mcommoninit(M *m)
        m->fastrand = 0x49f6428aUL + m->id;
        m->stackalloc = runtime·malloc(sizeof(*m->stackalloc));
        runtime·FixAlloc_Init(m->stackalloc, FixedStack, runtime·SysAlloc, nil, nil);
+
+       if(m->mcache == nil)
+               m->mcache = runtime·allocmcache();
 }
 
 // Try to increment mcpu.  Report whether succeeded.
@@ -422,7 +426,7 @@ mget(G *g)
        M *m;
 
        // if g has its own m, use it.
-       if((m = g->lockedm) != nil)
+       if(g && (m = g->lockedm) != nil)
                return m;
 
        // otherwise use general m pool.
@@ -507,6 +511,7 @@ nextgandunlock(void)
        G *gp;
        uint32 v;
 
+top:
        if(atomic_mcpu(runtime·sched.atomic) >= maxgomaxprocs)
                runtime·throw("negative mcpu");
 
@@ -584,12 +589,49 @@ nextgandunlock(void)
        schedunlock();
 
        runtime·notesleep(&m->havenextg);
+       if(m->helpgc) {
+               runtime·gchelper();
+               m->helpgc = 0;
+               runtime·lock(&runtime·sched);
+               goto top;
+       }
        if((gp = m->nextg) == nil)
                runtime·throw("bad m->nextg in nextgoroutine");
        m->nextg = nil;
        return gp;
 }
 
+// runtime·helpgc wakes m's from the scheduler's general m pool to
+// help with the garbage collection and returns how many it woke.
+// runtime·gc adds the result to work.nproc.
+int32
+runtime·helpgc(void)
+{
+       M *m;
+       int32 n, max;
+
+       // Figure out how many CPUs to use.
+       // Limited by gomaxprocs, number of actual CPUs, and MaxGcproc.
+       max = runtime·gomaxprocs;
+       if(max > runtime·ncpu)
+               max = runtime·ncpu;
+       if(max > MaxGcproc)
+               max = MaxGcproc;
+
+
+       // We're going to use one CPU no matter what.
+       // Figure out the max number of additional CPUs.
+       max--;
+
+       // Take m's with mget(nil) until we have enough or none are left.
+       // helpgc is set before the wakeup so that the m, on returning
+       // from notesleep(&m->havenextg) in nextgandunlock, runs
+       // runtime·gchelper instead of looking for m->nextg.
+       runtime·lock(&runtime·sched);
+       n = 0;
+       while(n < max && (m = mget(nil)) != nil) {
+               n++;
+               m->helpgc = 1;
+               m->waitnextg = 0;
+               runtime·notewakeup(&m->havenextg);
+       }
+       runtime·unlock(&runtime·sched);
+       return n;
+}
+
 void
 runtime·stoptheworld(void)
 {
@@ -626,15 +668,28 @@ runtime·stoptheworld(void)
        schedunlock();
 }
 
-// TODO(rsc): Remove. This is only temporary,
-// for the mark and sweep collector.
+// runtime·starttheworld clears runtime·gcwaiting, sets mcpumax back
+// to gomaxprocs and calls matchmg, all while holding the scheduler
+// lock.  If extra is true and canaddmcpu succeeds, it also starts
+// one additional m, as described below.
 void
-runtime·starttheworld(void)
+runtime·starttheworld(bool extra)
 {
+       M *m;
+
        schedlock();
        runtime·gcwaiting = 0;
        setmcpumax(runtime·gomaxprocs);
        matchmg();
+       if(extra && canaddmcpu()) {
+               // Start a new m that will (we hope) be idle
+               // and so available to help when the next
+               // garbage collection happens.
+               // canaddmcpu above did mcpu++
+               // (necessary, because m will be doing various
+               // initialization work so is definitely running),
+               // but m is not running a specific goroutine,
+               // so set the helpgc flag as a signal to m's
+               // first schedule(nil) to mcpu--.
+               m = startm();
+               m->helpgc = 1;
+       }
        schedunlock();
 }
 
@@ -644,8 +699,6 @@ runtime·mstart(void)
 {
        if(g != m->g0)
                runtime·throw("bad runtime·mstart");
-       if(m->mcache == nil)
-               m->mcache = runtime·allocmcache();
 
        // Record top of stack for use by mcall.
        // Once we call schedule we're never coming back,
@@ -677,46 +730,55 @@ struct CgoThreadStart
 static void
 matchmg(void)
 {
-       G *g;
+       G *gp;
+       M *mp;
 
        if(m->mallocing || m->gcing)
                return;
 
        while(haveg() && canaddmcpu()) {
-               g = gget();
-               if(g == nil)
+               gp = gget();
+               if(gp == nil)
                        runtime·throw("gget inconsistency");
 
-               // Find the m that will run g.
-               M *m;
-               if((m = mget(g)) == nil){
-                       m = runtime·malloc(sizeof(M));
-                       mcommoninit(m);
-
-                       if(runtime·iscgo) {
-                               CgoThreadStart ts;
-
-                               if(libcgo_thread_start == nil)
-                                       runtime·throw("libcgo_thread_start missing");
-                               // pthread_create will make us a stack.
-                               m->g0 = runtime·malg(-1);
-                               ts.m = m;
-                               ts.g = m->g0;
-                               ts.fn = runtime·mstart;
-                               runtime·asmcgocall(libcgo_thread_start, &ts);
-                       } else {
-                               if(Windows)
-                                       // windows will layout sched stack on os stack
-                                       m->g0 = runtime·malg(-1);
-                               else
-                                       m->g0 = runtime·malg(8192);
-                               runtime·newosproc(m, m->g0, m->g0->stackbase, runtime·mstart);
-                       }
-               }
-               mnextg(m, g);
+               // Find the m that will run gp.
+               if((mp = mget(gp)) == nil)
+                       mp = startm();
+               mnextg(mp, gp);
        }
 }
 
+// startm allocates a new M, initializes it with mcommoninit, and
+// starts a thread for it that runs runtime·mstart.  It returns the
+// new M.  Called from matchmg and runtime·starttheworld.
+static M*
+startm(void)
+{
+       M *m;
+
+       m = runtime·malloc(sizeof(M));
+       mcommoninit(m);
+
+       if(runtime·iscgo) {
+               CgoThreadStart ts;
+
+               // Under cgo the thread is created by libcgo_thread_start,
+               // invoked through asmcgocall with the m, g0 and entry point.
+               if(libcgo_thread_start == nil)
+                       runtime·throw("libcgo_thread_start missing");
+               // pthread_create will make us a stack.
+               // (malg(-1) presumably creates the g without allocating a stack - confirm.)
+               m->g0 = runtime·malg(-1);
+               ts.m = m;
+               ts.g = m->g0;
+               ts.fn = runtime·mstart;
+               runtime·asmcgocall(libcgo_thread_start, &ts);
+       } else {
+               if(Windows)
+                       // Windows will lay out the sched stack on the OS stack.
+                       m->g0 = runtime·malg(-1);
+               else
+                       m->g0 = runtime·malg(8192);
+               runtime·newosproc(m, m->g0, m->g0->stackbase, runtime·mstart);
+       }
+
+       return m;
+}
+
 // One round of scheduler: find a goroutine and run it.
 // The argument is the goroutine that was running before
 // schedule was called, or nil if this is the first call.
@@ -767,6 +829,12 @@ schedule(G *gp)
                        gp->readyonstop = 0;
                        readylocked(gp);
                }
+       } else if(m->helpgc) {
+               // atomic { mcpu-- }
+               v = runtime·xadd(&runtime·sched.atomic, -1<<mcpuShift);
+               if(atomic_mcpu(v) > maxgomaxprocs)
+                       runtime·throw("negative mcpu in scheduler");
+               m->helpgc = 0;
        }
 
        // Find (or wait for) g to run.  Unlocks runtime·sched.
@@ -1097,7 +1165,7 @@ runtime·newproc1(byte *fn, byte *argp, int32 narg, int32 nret, void *callerpc)
 //printf("newproc1 %p %p narg=%d nret=%d\n", fn, argp, narg, nret);
        siz = narg + nret;
        siz = (siz+7) & ~7;
-       
+
        // We could instead create a secondary stack frame
        // and make it look like goexit was on the original but
        // the call to the actual goroutine function was split.
index 999511ac28845bbb3047e0ea102a3b44e3c8d707..63f7d65dfbb16acab9cb339392d27ac8c2b5bbef 100644 (file)
@@ -57,7 +57,7 @@ typedef       struct  String          String;
 typedef        struct  Usema           Usema;
 typedef        struct  SigTab          SigTab;
 typedef        struct  MCache          MCache;
-typedef struct FixAlloc        FixAlloc;
+typedef        struct  FixAlloc        FixAlloc;
 typedef        struct  Iface           Iface;
 typedef        struct  Itab            Itab;
 typedef        struct  Eface           Eface;
@@ -238,6 +238,7 @@ struct      M
        int32   waitnextg;
        int32   dying;
        int32   profilehz;
+       int32   helpgc;
        uint32  fastrand;
        uint64  ncgocall;
        Note    havenextg;
@@ -406,6 +407,7 @@ extern      bool    runtime·singleproc;
 extern uint32  runtime·panicking;
 extern int32   runtime·gcwaiting;             // gc is waiting to run
 int8*  runtime·goos;
+int32  runtime·ncpu;
 extern bool    runtime·iscgo;
 extern void    (*runtime·destroylock)(Lock*);
 
@@ -515,6 +517,7 @@ void        runtime·startpanic(void);
 void   runtime·sigprof(uint8 *pc, uint8 *sp, uint8 *lr, G *gp);
 void   runtime·resetcpuprofiler(int32);
 void   runtime·setcpuprofilerate(void(*)(uintptr*, int32), int32);
+void   runtime·usleep(uint32);
 
 #pragma        varargck        argpos  runtime·printf 1
 #pragma        varargck        type    "d"     int32
@@ -534,7 +537,7 @@ void        runtime·setcpuprofilerate(void(*)(uintptr*, int32), int32);
 // TODO(rsc): Remove. These are only temporary,
 // for the mark and sweep collector.
 void   runtime·stoptheworld(void);
-void   runtime·starttheworld(void);
+void   runtime·starttheworld(bool);
 
 /*
  * mutual exclusion locks.  in the uncontended case,
index e833843826558b768f922049bd4085134ebcb13e..acf98a7dc437ddb7c5a53aa487438a2acd1fa120 100644 (file)
@@ -18,7 +18,7 @@ all: $(addsuffix .out, $(ALL))
        $(LD) -o $@ $*.$O
 
 %.bench: %.out
-       ./$*.out
+       time ./$*.out
 
 bench: $(addsuffix .bench, $(ALL))
 
index 19a96bc63bd8f0fd859ac330bad681e87b78194e..06cc48384a17cde929d91cbc233912aeabc0c2e0 100644 (file)
@@ -73,10 +73,6 @@ func parseDir(dirpath string) map[string]*ast.Package {
 }
 
 func main() {
-       runtime.GOMAXPROCS(4)
-       go func() {}()
-       go func() {}()
-       go func() {}()
        st := &runtime.MemStats
        packages = append(packages, packages...)
        packages = append(packages, packages...)
@@ -132,7 +128,6 @@ func main() {
        }
 }
 
-
 var packages = []string{
        "archive/tar",
        "asn1",
@@ -148,7 +143,6 @@ var packages = []string{
        "container/ring",
        "container/vector",
        "crypto/aes",
-       "crypto/block",
        "crypto/blowfish",
        "crypto/hmac",
        "crypto/md4",
@@ -167,7 +161,6 @@ var packages = []string{
        "debug/macho",
        "debug/elf",
        "debug/gosym",
-       "debug/proc",
        "ebnf",
        "encoding/ascii85",
        "encoding/base64",
@@ -177,9 +170,6 @@ var packages = []string{
        "encoding/pem",
        "exec",
        "exp/datafmt",
-       "exp/draw",
-       "exp/eval",
-       "exp/iterable",
        "expvar",
        "flag",
        "fmt",