From: Russ Cox Date: Sat, 15 Nov 2014 13:00:38 +0000 (-0500) Subject: [dev.garbage] all: merge dev.cc into dev.garbage X-Git-Tag: go1.5beta1~2684^2~10 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=0fcf54b3d2bc42a947c65e9a520d078b671f8432;p=gostls13.git [dev.garbage] all: merge dev.cc into dev.garbage The garbage collector is now written in Go. There is plenty to clean up (just like on dev.cc). all.bash passes on darwin/amd64, darwin/386, linux/amd64, linux/386. TBR=rlh R=austin, rlh, bradfitz CC=golang-codereviews https://golang.org/cl/173250043 --- 0fcf54b3d2bc42a947c65e9a520d078b671f8432 diff --cc src/runtime/lfstack.go index 0000000000,4a20fff9d8..a4ad8a10c6 mode 000000,100644..100644 --- a/src/runtime/lfstack.go +++ b/src/runtime/lfstack.go @@@ -1,0 -1,40 +1,36 @@@ + // Copyright 2012 The Go Authors. All rights reserved. + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + + // Lock-free stack. + // The following code runs only on g0 stack. + + package runtime + + import "unsafe" + + func lfstackpush(head *uint64, node *lfnode) { + node.pushcnt++ + new := lfstackPack(node, node.pushcnt) + for { + old := atomicload64(head) - node.next, _ = lfstackUnpack(old) ++ node.next = old + if cas64(head, old, new) { + break + } + } + } + + func lfstackpop(head *uint64) unsafe.Pointer { + for { + old := atomicload64(head) + if old == 0 { + return nil + } + node, _ := lfstackUnpack(old) - node2 := (*lfnode)(atomicloadp(unsafe.Pointer(&node.next))) - new := uint64(0) - if node2 != nil { - new = lfstackPack(node2, node2.pushcnt) - } - if cas64(head, old, new) { ++ next := atomicload64(&node.next) ++ if cas64(head, old, next) { + return unsafe.Pointer(node) + } + } + } diff --cc src/runtime/malloc.go index fab8cf2695,20cb6818d2..f90a8f84a3 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@@ -306,18 -295,6 +297,17 @@@ func mallocgc(size uintptr, typ *_type } } marked: + + // GCmarkterminate allocates black + // All slots hold nil so no scanning is needed. + // This may be racing with GC so do it atomically if there can be + // a race marking the bit. + if gcphase == _GCmarktermination { - mp := acquirem() - mp.ptrarg[0] = x - onM(gcmarknewobject_m) - releasem(mp) ++ systemstack(func() { ++ gcmarknewobject_m(uintptr(x)) ++ }) + } + if raceenabled { racemalloc(x, size) } @@@ -358,37 -335,6 +348,36 @@@ return x } +func loadPtrMask(typ *_type) []uint8 { + var ptrmask *uint8 + nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize + if typ.kind&kindGCProg != 0 { + masksize := nptr + if masksize%2 != 0 { + masksize *= 2 // repeated + } + masksize = masksize * pointersPerByte / 8 // 4 bits per word + masksize++ // unroll flag in the beginning + if masksize > maxGCMask && typ.gc[1] != 0 { + // write barriers have not been updated to deal with this case yet. 
+ gothrow("maxGCMask too small for now") + } + ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0]))) + // Check whether the program is already unrolled + // by checking if the unroll flag byte is set + maskword := uintptr(atomicloadp(unsafe.Pointer(ptrmask))) + if *(*uint8)(unsafe.Pointer(&maskword)) == 0 { - mp := acquirem() - mp.ptrarg[0] = unsafe.Pointer(typ) - onM(unrollgcprog_m) - releasem(mp) ++ systemstack(func() { ++ unrollgcprog_m(typ) ++ }) + } + ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte + } else { + ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask + } + return (*[1 << 30]byte)(unsafe.Pointer(ptrmask))[:(nptr+1)/2] +} + // implementation of new builtin func newobject(typ *_type) unsafe.Pointer { flags := uint32(0) @@@ -483,20 -429,7 +472,21 @@@ func gogc(force int32) mp = acquirem() mp.gcing = 1 releasem(mp) + - onM(stoptheworld) - onM(finishsweep_m) // finish sweep before we start concurrent scan. - if false { // To turn on concurrent scan and mark set to true... - onM(starttheworld) + systemstack(stoptheworld) ++ systemstack(finishsweep_m) // finish sweep before we start concurrent scan. ++ if false { // To turn on concurrent scan and mark set to true... ++ systemstack(starttheworld) + // Do a concurrent heap scan before we stop the world. - onM(gcscan_m) - onM(stoptheworld) - onM(gcinstallmarkwb_m) - onM(starttheworld) - onM(gcmark_m) - onM(stoptheworld) - onM(gcinstalloffwb_m) ++ systemstack(gcscan_m) ++ systemstack(stoptheworld) ++ systemstack(gcinstallmarkwb_m) ++ systemstack(starttheworld) ++ systemstack(gcmark_m) ++ systemstack(stoptheworld) ++ systemstack(gcinstalloffwb_m) + } ++ if mp != acquirem() { gothrow("gogc: rescheduled") } @@@ -512,23 -445,17 +502,21 @@@ if debug.gctrace > 1 { n = 2 } ++ eagersweep := force >= 2 for i := 0; i < n; i++ { if i > 0 { startTime = nanotime() } // switch to g0, call gc, then switch back - mp.scalararg[0] = uintptr(uint32(startTime)) // low 32 bits - mp.scalararg[1] = uintptr(startTime >> 32) // high 32 bits - if force >= 2 { - mp.scalararg[2] = 1 // eagersweep - } else { - mp.scalararg[2] = 0 - } - onM(gc_m) - eagersweep := force >= 2 + systemstack(func() { + gc_m(startTime, eagersweep) + }) } - onM(gccheckmark_m) ++ systemstack(func() { ++ gccheckmark_m(startTime, eagersweep) ++ }) + // all done mp.gcing = 0 semrelease(&worldsema) @@@ -543,14 -470,6 +531,14 @@@ } } +func GCcheckmarkenable() { - onM(gccheckmarkenable_m) ++ systemstack(gccheckmarkenable_m) +} + +func GCcheckmarkdisable() { - onM(gccheckmarkdisable_m) ++ systemstack(gccheckmarkdisable_m) +} + // GC runs a garbage collection. func GC() { gogc(2) diff --cc src/runtime/malloc2.go index 0000000000,e4bd963d30..4ac0207b1e mode 000000,100644..100644 --- a/src/runtime/malloc2.go +++ b/src/runtime/malloc2.go @@@ -1,0 -1,475 +1,473 @@@ + // Copyright 2009 The Go Authors. All rights reserved. + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + + package runtime + + import "unsafe" + + // Memory allocator, based on tcmalloc. + // http://goog-perftools.sourceforge.net/doc/tcmalloc.html + + // The main allocator works in runs of pages. + // Small allocation sizes (up to and including 32 kB) are + // rounded to one of about 100 size classes, each of which + // has its own free list of objects of exactly that size. + // Any free page of memory can be split into a set of objects + // of one size class, which are then managed using free list + // allocators. 
+ // + // The allocator's data structures are: + // + // FixAlloc: a free-list allocator for fixed-size objects, + // used to manage storage used by the allocator. + // MHeap: the malloc heap, managed at page (4096-byte) granularity. + // MSpan: a run of pages managed by the MHeap. + // MCentral: a shared free list for a given size class. + // MCache: a per-thread (in Go, per-P) cache for small objects. + // MStats: allocation statistics. + // + // Allocating a small object proceeds up a hierarchy of caches: + // + // 1. Round the size up to one of the small size classes + // and look in the corresponding MCache free list. + // If the list is not empty, allocate an object from it. + // This can all be done without acquiring a lock. + // + // 2. If the MCache free list is empty, replenish it by + // taking a bunch of objects from the MCentral free list. + // Moving a bunch amortizes the cost of acquiring the MCentral lock. + // + // 3. If the MCentral free list is empty, replenish it by + // allocating a run of pages from the MHeap and then + // chopping that memory into a objects of the given size. + // Allocating many objects amortizes the cost of locking + // the heap. + // + // 4. If the MHeap is empty or has no page runs large enough, + // allocate a new group of pages (at least 1MB) from the + // operating system. Allocating a large run of pages + // amortizes the cost of talking to the operating system. + // + // Freeing a small object proceeds up the same hierarchy: + // + // 1. Look up the size class for the object and add it to + // the MCache free list. + // + // 2. If the MCache free list is too long or the MCache has + // too much memory, return some to the MCentral free lists. + // + // 3. If all the objects in a given span have returned to + // the MCentral list, return that span to the page heap. + // + // 4. If the heap has too much memory, return some to the + // operating system. + // + // TODO(rsc): Step 4 is not implemented. + // + // Allocating and freeing a large object uses the page heap + // directly, bypassing the MCache and MCentral free lists. + // + // The small objects on the MCache and MCentral free lists + // may or may not be zeroed. They are zeroed if and only if + // the second word of the object is zero. A span in the + // page heap is zeroed unless s->needzero is set. When a span + // is allocated to break into small objects, it is zeroed if needed + // and s->needzero is set. There are two main benefits to delaying the + // zeroing this way: + // + // 1. stack frames allocated from the small object lists + // or the page heap can avoid zeroing altogether. + // 2. the cost of zeroing when reusing a small object is + // charged to the mutator, not the garbage collector. + // + // This C code was written with an eye toward translating to Go + // in the future. Methods have the form Type_Method(Type *t, ...). + + const ( + _PageShift = 13 + _PageSize = 1 << _PageShift + _PageMask = _PageSize - 1 + ) + + const ( + // _64bit = 1 on 64-bit systems, 0 on 32-bit systems + _64bit = 1 << (^uintptr(0) >> 63) / 2 + + // Computed constant. The definition of MaxSmallSize and the + // algorithm in msize.c produce some number of different allocation + // size classes. NumSizeClasses is that number. It's needed here + // because there are static arrays of this length; when msize runs its + // size choosing algorithm it double-checks that NumSizeClasses agrees. + _NumSizeClasses = 67 + + // Tunable constants. 
+ _MaxSmallSize = 32 << 10 + + // Tiny allocator parameters, see "Tiny allocator" comment in malloc.goc. + _TinySize = 16 + _TinySizeClass = 2 + + _FixAllocChunk = 16 << 10 // Chunk size for FixAlloc + _MaxMHeapList = 1 << (20 - _PageShift) // Maximum page length for fixed-size list in MHeap. + _HeapAllocChunk = 1 << 20 // Chunk size for heap growth + + // Per-P, per order stack segment cache size. + _StackCacheSize = 32 * 1024 + + // Number of orders that get caching. Order 0 is FixedStack + // and each successive order is twice as large. + _NumStackOrders = 3 + + // Number of bits in page to span calculations (4k pages). + // On Windows 64-bit we limit the arena to 32GB or 35 bits. + // Windows counts memory used by page table into committed memory + // of the process, so we can't reserve too much memory. + // See http://golang.org/issue/5402 and http://golang.org/issue/5236. + // On other 64-bit platforms, we limit the arena to 128GB, or 37 bits. + // On 32-bit, we don't bother limiting anything, so we use the full 32-bit address. + _MHeapMap_TotalBits = (_64bit*_Windows)*35 + (_64bit*(1-_Windows))*37 + (1-_64bit)*32 + _MHeapMap_Bits = _MHeapMap_TotalBits - _PageShift + + _MaxMem = uintptr(1<<_MHeapMap_TotalBits - 1) + + // Max number of threads to run garbage collection. + // 2, 3, and 4 are all plausible maximums depending + // on the hardware details of the machine. The garbage + // collector scales well to 32 cpus. + _MaxGcproc = 32 + ) + + // A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).) + type mlink struct { + next *mlink + } + + // sysAlloc obtains a large chunk of zeroed memory from the + // operating system, typically on the order of a hundred kilobytes + // or a megabyte. + // NOTE: sysAlloc returns OS-aligned memory, but the heap allocator + // may use larger alignment, so the caller must be careful to realign the + // memory obtained by sysAlloc. + // + // SysUnused notifies the operating system that the contents + // of the memory region are no longer needed and can be reused + // for other purposes. + // SysUsed notifies the operating system that the contents + // of the memory region are needed again. + // + // SysFree returns it unconditionally; this is only used if + // an out-of-memory error has been detected midway through + // an allocation. It is okay if SysFree is a no-op. + // + // SysReserve reserves address space without allocating memory. + // If the pointer passed to it is non-nil, the caller wants the + // reservation there, but SysReserve can still choose another + // location if that one is unavailable. On some systems and in some + // cases SysReserve will simply check that the address space is + // available and not actually reserve it. If SysReserve returns + // non-nil, it sets *reserved to true if the address space is + // reserved, false if it has merely been checked. + // NOTE: SysReserve returns OS-aligned memory, but the heap allocator + // may use larger alignment, so the caller must be careful to realign the + // memory obtained by sysAlloc. + // + // SysMap maps previously reserved address space for use. + // The reserved argument is true if the address space was really + // reserved, not merely checked. + // + // SysFault marks a (already sysAlloc'd) region to fault + // if accessed. Used only for debugging the runtime. + + // FixAlloc is a simple free-list allocator for fixed size objects. + // Malloc uses a FixAlloc wrapped around sysAlloc to manages its + // MCache and MSpan objects. 
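
Editor's note: FixAlloc, described here, is a plain free-list allocator for fixed-size objects: it carves chunks obtained from sysAlloc, threads freed objects onto a list through their first word, and hands them back out. The sketch below is a safe-Go analogue of that shape, not the runtime's fixalloc (which works on raw memory and returns unzeroed objects); objsPerChunk and the node type are made up for the example.

package main

import "fmt"

const objsPerChunk = 4 // the real FixAlloc grows in 16 kB chunks (_FixAllocChunk)

// node plays the role of mlink: freed objects are threaded into a list
// through their first field, so their previous contents are "smashed".
type node struct {
	next *node
	val  int
}

// fixAlloc hands out fixed-size objects from a free list, refilling the
// list a chunk at a time, like the runtime's fixalloc but in safe Go.
type fixAlloc struct {
	free  *node
	inuse int
}

func (f *fixAlloc) alloc() *node {
	if f.free == nil {
		// Out of freed objects: grab a new chunk and thread it onto the list.
		chunk := make([]node, objsPerChunk)
		for i := range chunk {
			chunk[i].next = f.free
			f.free = &chunk[i]
		}
	}
	n := f.free
	f.free = n.next
	f.inuse++
	*n = node{} // caller sees zeroed memory here, unlike the real FixAlloc
	return n
}

func (f *fixAlloc) freeObj(n *node) {
	n.next = f.free // first word is overwritten by the list link
	f.free = n
	f.inuse--
}

func main() {
	var f fixAlloc
	a, b := f.alloc(), f.alloc()
	a.val, b.val = 1, 2
	f.freeObj(a)
	c := f.alloc() // reuses a's slot
	fmt.Println(c == a, f.inuse)
}

This mirrors the runtime comment that callers may keep state in the object but the first word is smashed by freeing and reallocating.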
+ // + // Memory returned by FixAlloc_Alloc is not zeroed. + // The caller is responsible for locking around FixAlloc calls. + // Callers can keep state in the object but the first word is + // smashed by freeing and reallocating. + type fixalloc struct { + size uintptr + first unsafe.Pointer // go func(unsafe.pointer, unsafe.pointer); f(arg, p) called first time p is returned + arg unsafe.Pointer + list *mlink + chunk *byte + nchunk uint32 + inuse uintptr // in-use bytes now + stat *uint64 + } + + // Statistics. + // Shared with Go: if you edit this structure, also edit type MemStats in mem.go. + type mstats struct { + // General statistics. + alloc uint64 // bytes allocated and still in use + total_alloc uint64 // bytes allocated (even if freed) + sys uint64 // bytes obtained from system (should be sum of xxx_sys below, no locking, approximate) + nlookup uint64 // number of pointer lookups + nmalloc uint64 // number of mallocs + nfree uint64 // number of frees + + // Statistics about malloc heap. + // protected by mheap.lock + heap_alloc uint64 // bytes allocated and still in use + heap_sys uint64 // bytes obtained from system + heap_idle uint64 // bytes in idle spans + heap_inuse uint64 // bytes in non-idle spans + heap_released uint64 // bytes released to the os + heap_objects uint64 // total number of allocated objects + + // Statistics about allocation of low-level fixed-size structures. + // Protected by FixAlloc locks. + stacks_inuse uint64 // this number is included in heap_inuse above + stacks_sys uint64 // always 0 in mstats + mspan_inuse uint64 // mspan structures + mspan_sys uint64 + mcache_inuse uint64 // mcache structures + mcache_sys uint64 + buckhash_sys uint64 // profiling bucket hash table + gc_sys uint64 + other_sys uint64 + + // Statistics about garbage collector. + // Protected by mheap or stopping the world during GC. + next_gc uint64 // next gc (in heap_alloc time) + last_gc uint64 // last gc (in absolute time) + pause_total_ns uint64 + pause_ns [256]uint64 // circular buffer of recent gc pause lengths + pause_end [256]uint64 // circular buffer of recent gc end times (nanoseconds since 1970) + numgc uint32 + enablegc bool + debuggc bool + + // Statistics about allocation size classes. + + by_size [_NumSizeClasses]struct { + size uint32 + nmalloc uint64 + nfree uint64 + } + + tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly + } + + var memstats mstats + + // Size classes. Computed and initialized by InitSizes. + // + // SizeToClass(0 <= n <= MaxSmallSize) returns the size class, + // 1 <= sizeclass < NumSizeClasses, for n. + // Size class 0 is reserved to mean "not small". + // + // class_to_size[i] = largest size in class i + // class_to_allocnpages[i] = number of pages to allocate when + // making new objects in class i + + var class_to_size [_NumSizeClasses]int32 + var class_to_allocnpages [_NumSizeClasses]int32 + var size_to_class8 [1024/8 + 1]int8 + var size_to_class128 [(_MaxSmallSize-1024)/128 + 1]int8 + + type mcachelist struct { + list *mlink + nlist uint32 + } + + type stackfreelist struct { + list *mlink // linked list of free stacks + size uintptr // total size of stacks in list + } + + // Per-thread (in Go, per-P) cache for small objects. + // No locking needed because it is per-thread (per-P). + type mcache struct { + // The following members are accessed on every malloc, + // so they are grouped here for better caching. 
+ next_sample int32 // trigger heap sample after allocating this many bytes + local_cachealloc intptr // bytes allocated (or freed) from cache since last lock of heap + // Allocator cache for tiny objects w/o pointers. + // See "Tiny allocator" comment in malloc.goc. + tiny *byte + tinysize uintptr + local_tinyallocs uintptr // number of tiny allocs not counted in other stats + + // The rest is not accessed on every malloc. + alloc [_NumSizeClasses]*mspan // spans to allocate from + + stackcache [_NumStackOrders]stackfreelist + + sudogcache *sudog + - gcworkbuf unsafe.Pointer - + // Local allocator stats, flushed during GC. + local_nlookup uintptr // number of pointer lookups + local_largefree uintptr // bytes freed for large objects (>maxsmallsize) + local_nlargefree uintptr // number of frees for large objects (>maxsmallsize) + local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize) + } + + const ( + _KindSpecialFinalizer = 1 + _KindSpecialProfile = 2 + // Note: The finalizer special must be first because if we're freeing + // an object, a finalizer special will cause the freeing operation + // to abort, and we want to keep the other special records around + // if that happens. + ) + + type special struct { + next *special // linked list in span + offset uint16 // span offset of object + kind byte // kind of special + } + + // The described object has a finalizer set for it. + type specialfinalizer struct { + special special + fn *funcval + nret uintptr + fint *_type + ot *ptrtype + } + + // The described object is being heap profiled. + type specialprofile struct { + special special + b *bucket + } + + // An MSpan is a run of pages. + const ( + _MSpanInUse = iota // allocated for garbage collected heap + _MSpanStack // allocated for use by stack allocator + _MSpanFree + _MSpanListHead + _MSpanDead + ) + + type mspan struct { + next *mspan // in a span linked list + prev *mspan // in a span linked list + start pageID // starting page number + npages uintptr // number of pages in span + freelist *mlink // list of free objects + // sweep generation: + // if sweepgen == h->sweepgen - 2, the span needs sweeping + // if sweepgen == h->sweepgen - 1, the span is currently being swept + // if sweepgen == h->sweepgen, the span is swept and ready to use + // h->sweepgen is incremented by 2 after every GC + sweepgen uint32 + ref uint16 // capacity - number of objects in freelist + sizeclass uint8 // size class + incache bool // being used by an mcache + state uint8 // mspaninuse etc + needzero uint8 // needs to be zeroed before allocation + elemsize uintptr // computed from sizeclass or from npages + unusedsince int64 // first time spotted by gc in mspanfree state + npreleased uintptr // number of pages released to the os + limit uintptr // end of data in span + speciallock mutex // guards specials list + specials *special // linked list of special records sorted by offset. + } + + // Every MSpan is in one doubly-linked list, + // either one of the MHeap's free lists or one of the + // MCentral's span lists. We use empty MSpan structures as list heads. + + // Central list of free objects of a given size. + type mcentral struct { + lock mutex + sizeclass int32 + nonempty mspan // list of spans with a free object + empty mspan // list of spans with no free objects (or cached in an mcache) + } + + // Main malloc heap. + // The heap itself is the "free[]" and "large" arrays, + // but all the other global data is here too. 
+ type mheap struct { + lock mutex + free [_MaxMHeapList]mspan // free lists of given length + freelarge mspan // free lists length >= _MaxMHeapList + busy [_MaxMHeapList]mspan // busy lists of large objects of given length + busylarge mspan // busy lists of large objects length >= _MaxMHeapList + allspans **mspan // all spans out there + gcspans **mspan // copy of allspans referenced by gc marker or sweeper + nspan uint32 + sweepgen uint32 // sweep generation, see comment in mspan + sweepdone uint32 // all spans are swept + + // span lookup + spans **mspan + spans_mapped uintptr + + // range of addresses we might see in the heap + bitmap uintptr + bitmap_mapped uintptr + arena_start uintptr + arena_used uintptr + arena_end uintptr + arena_reserved bool + + // central free lists for small size classes. + // the padding makes sure that the MCentrals are + // spaced CacheLineSize bytes apart, so that each MCentral.lock + // gets its own cache line. + central [_NumSizeClasses]struct { + mcentral mcentral + pad [_CacheLineSize]byte + } + + spanalloc fixalloc // allocator for span* + cachealloc fixalloc // allocator for mcache* + specialfinalizeralloc fixalloc // allocator for specialfinalizer* + specialprofilealloc fixalloc // allocator for specialprofile* + speciallock mutex // lock for sepcial record allocators. + + // Malloc stats. + largefree uint64 // bytes freed for large objects (>maxsmallsize) + nlargefree uint64 // number of frees for large objects (>maxsmallsize) + nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize) + } + + var mheap_ mheap + + const ( + // flags to malloc + _FlagNoScan = 1 << 0 // GC doesn't have to scan object + _FlagNoZero = 1 << 1 // don't zero memory + ) + + // NOTE: Layout known to queuefinalizer. + type finalizer struct { + fn *funcval // function to call + arg unsafe.Pointer // ptr to object + nret uintptr // bytes of return values from fn + fint *_type // type of first argument of fn + ot *ptrtype // type of ptr to object + } + + type finblock struct { + alllink *finblock + next *finblock + cnt int32 + cap int32 + fin [1]finalizer + } + + // Information from the compiler about the layout of stack frames. + type bitvector struct { + n int32 // # of bits + bytedata *uint8 + } + + type stackmap struct { + n int32 // number of bitmaps + nbit int32 // number of bits in each bitmap + bytedata [0]byte // bitmaps, each starting on a 32-bit boundary + } + + // Returns pointer map data for the given stackmap index + // (the index is encoded in PCDATA_StackMapIndex). + + // defined in mgc0.go diff --cc src/runtime/mcache.go index 0000000000,d3afef6be6..08b1bc3597 mode 000000,100644..100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@@ -1,0 -1,86 +1,91 @@@ + // Copyright 2009 The Go Authors. All rights reserved. + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + + // Per-P malloc cache for small objects. + // + // See malloc.h for an overview. + + package runtime + + import "unsafe" + + // dummy MSpan that contains no free objects. + var emptymspan mspan + + func allocmcache() *mcache { + lock(&mheap_.lock) + c := (*mcache)(fixAlloc_Alloc(&mheap_.cachealloc)) + unlock(&mheap_.lock) + memclr(unsafe.Pointer(c), unsafe.Sizeof(*c)) + for i := 0; i < _NumSizeClasses; i++ { + c.alloc[i] = &emptymspan + } + + // Set first allocation sample size. 
+ rate := MemProfileRate + if rate > 0x3fffffff { // make 2*rate not overflow + rate = 0x3fffffff + } + if rate != 0 { + c.next_sample = int32(int(fastrand1()) % (2 * rate)) + } + + return c + } + + func freemcache(c *mcache) { + systemstack(func() { + mCache_ReleaseAll(c) + stackcache_clear(c) - gcworkbuffree(c.gcworkbuf) ++ ++ // NOTE(rsc,rlh): If gcworkbuffree comes back, we need to coordinate ++ // with the stealing of gcworkbufs during garbage collection to avoid ++ // a race where the workbuf is double-freed. ++ // gcworkbuffree(c.gcworkbuf) ++ + lock(&mheap_.lock) + purgecachedstats(c) + fixAlloc_Free(&mheap_.cachealloc, unsafe.Pointer(c)) + unlock(&mheap_.lock) + }) + } + + // Gets a span that has a free object in it and assigns it + // to be the cached span for the given sizeclass. Returns this span. + func mCache_Refill(c *mcache, sizeclass int32) *mspan { + _g_ := getg() + + _g_.m.locks++ + // Return the current cached span to the central lists. + s := c.alloc[sizeclass] + if s.freelist != nil { + gothrow("refill on a nonempty span") + } + if s != &emptymspan { + s.incache = false + } + + // Get a new cached span from the central lists. + s = mCentral_CacheSpan(&mheap_.central[sizeclass].mcentral) + if s == nil { + gothrow("out of memory") + } + if s.freelist == nil { + println(s.ref, (s.npages<<_PageShift)/s.elemsize) + gothrow("empty span") + } + c.alloc[sizeclass] = s + _g_.m.locks-- + return s + } + + func mCache_ReleaseAll(c *mcache) { + for i := 0; i < _NumSizeClasses; i++ { + s := c.alloc[i] + if s != &emptymspan { + mCentral_UncacheSpan(&mheap_.central[i].mcentral, s) + c.alloc[i] = &emptymspan + } + } + } diff --cc src/runtime/mgc.go index 0000000000,f44d7ddbce..57bd8b3563 mode 000000,100644..100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@@ -1,0 -1,1798 +1,2422 @@@ + // Copyright 2009 The Go Authors. All rights reserved. + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + + // TODO(rsc): The code having to do with the heap bitmap needs very serious cleanup. + // It has gotten completely out of control. + + // Garbage collector (GC). + // -// GC is: -// - mark&sweep -// - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc) -// - parallel (up to MaxGcproc threads) -// - partially concurrent (mark is stop-the-world, while sweep is concurrent) -// - non-moving/non-compacting -// - full (non-partial) ++// The GC runs concurrently with mutator threads, is type accurate (aka precise), allows multiple GC ++// thread to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is ++// non-generational and non-compacting. Allocation is done using size segregated per P allocation ++// areas to minimize fragmentation while eliminating locks in the common case. + // -// GC rate. -// Next GC is after we've allocated an extra amount of memory proportional to -// the amount already in use. The proportion is controlled by GOGC environment variable -// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M -// (this mark is tracked in next_gc variable). This keeps the GC cost in linear -// proportion to the allocation cost. Adjusting GOGC just changes the linear constant -// (and also the amount of extra memory used). ++// The algorithm decomposes into several steps. ++// This is a high level description of the algorithm being used. For an overview of GC a good ++// place to start is Richard Jones' gchandbook.org. 
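
Editor's note: the new collector described here is a concurrent mark-and-sweep built around a write barrier in the Dijkstra on-the-fly tradition cited just below. The heart of it is an insertion-style barrier: when the mutator stores a pointer while marking is running, the pointed-to object is shaded grey so it cannot stay hidden behind an already-black object. The following is a schematic, single-goroutine sketch of that invariant only; none of these names correspond to runtime code, and the real barrier, work buffers, and phase handshakes are far more involved.

package main

import "fmt"

type color int

const (
	white color = iota // not yet reached
	grey               // reached, fields not yet scanned
	black              // reached and fully scanned
)

type object struct {
	name string
	col  color
	refs []*object
}

var work []*object // the grey work list (stand-in for the workbufs)

func shade(o *object) {
	if o != nil && o.col == white {
		o.col = grey
		work = append(work, o)
	}
}

// writePointer is the mutator's pointer store with the barrier enabled:
// shade the new target as it becomes reachable from dst.
func writePointer(dst *object, i int, src *object) {
	shade(src)
	dst.refs[i] = src
}

// drain blackens grey objects, shading everything they point to.
func drain() {
	for len(work) > 0 {
		o := work[len(work)-1]
		work = work[:len(work)-1]
		for _, r := range o.refs {
			shade(r)
		}
		o.col = black
	}
}

func main() {
	root := &object{name: "root", refs: make([]*object, 1)}
	hidden := &object{name: "hidden"}

	shade(root)
	drain() // root is black, hidden is still white

	// The mutator now stores a pointer to hidden into the black root.
	// Without the barrier, hidden would stay white and be swept.
	writePointer(root, 0, hidden)
	drain()

	fmt.Println("hidden is black:", hidden.col == black)
}

This corresponds to steps 5 and 6 of the phase list that follows: the barrier only greys, the background mark loop blackens, and once mark termination switches allocation to black the number of unmarked reachable objects can only shrink.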
++// ++// The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see ++// Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978. ++// On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978), 966-975. ++// For journal quality proofs that these steps are complete, correct, and terminate see ++// Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world. ++// Concurrency and Computation: Practice and Experience 15(3-5), 2003. + // ++// 0. Set phase = GCscan from GCoff. ++// 1. Wait for all P's to acknowledge phase change. ++// At this point all goroutines have passed through a GC safepoint and ++// know we are in the GCscan phase. ++// 2. GC scans all goroutine stacks, mark and enqueues all encountered pointers ++// (marking avoids most duplicate enqueuing but races may produce duplication which is benign). ++// Preempted goroutines are scanned before P schedules next goroutine. ++// 3. Set phase = GCmark. ++// 4. Wait for all P's to acknowledge phase change. ++// 5. Now write barrier marks and enqueues black, grey, or white to white pointers. ++// Malloc still allocates white (non-marked) objects. ++// 6. Meanwhile GC transitively walks the heap marking reachable objects. ++// 7. When GC finishes marking heap, it preempts P's one-by-one and ++// retakes partial wbufs (filled by write barrier or during a stack scan of the goroutine ++// currently scheduled on the P). ++// 8. Once the GC has exhausted all available marking work it sets phase = marktermination. ++// 9. Wait for all P's to acknowledge phase change. ++// 10. Malloc now allocates black objects, so number of unmarked reachable objects ++// monotonically decreases. ++// 11. GC preempts P's one-by-one taking partial wbufs and marks all unmarked yet reachable objects. ++// 12. When GC completes a full cycle over P's and discovers no new grey ++// objects, (which means all reachable objects are marked) set phase = GCsweep. ++// 13. Wait for all P's to acknowledge phase change. ++// 14. Now malloc allocates white (but sweeps spans before use). ++// Write barrier becomes nop. ++// 15. GC does background sweeping, see description below. ++// 16. When sweeping is complete set phase to GCoff. ++// 17. When sufficient allocation has taken place replay the sequence starting at 0 above, ++// see discussion of GC rate below. ++ ++// Changing phases. ++// Phases are changed by setting the gcphase to the next phase and possibly calling ackgcphase. ++// All phase action must be benign in the presence of a change. ++// Starting with GCoff ++// GCoff to GCscan ++// GSscan scans stacks and globals greying them and never marks an object black. ++// Once all the P's are aware of the new phase they will scan gs on preemption. ++// This means that the scanning of preempted gs can't start until all the Ps ++// have acknowledged. ++// GCscan to GCmark ++// GCMark turns on the write barrier which also only greys objects. No scanning ++// of objects (making them black) can happen until all the Ps have acknowledged ++// the phase change. ++// GCmark to GCmarktermination ++// The only change here is that we start allocating black so the Ps must acknowledge ++// the change before we begin the termination algorithm ++// GCmarktermination to GSsweep ++// Object currently on the freelist must be marked black for this to work. ++// Are things on the free lists black or white? How does the sweep phase work? ++ + // Concurrent sweep. 
+ // The sweep phase proceeds concurrently with normal program execution. + // The heap is swept span-by-span both lazily (when a goroutine needs another span) + // and concurrently in a background goroutine (this helps programs that are not CPU bound). + // However, at the end of the stop-the-world GC phase we don't know the size of the live heap, + // and so next_gc calculation is tricky and happens as follows. + // At the end of the stop-the-world phase next_gc is conservatively set based on total + // heap size; all spans are marked as "needs sweeping". + // Whenever a span is swept, next_gc is decremented by GOGC*newly_freed_memory. + // The background sweeper goroutine simply sweeps spans one-by-one bringing next_gc + // closer to the target value. However, this is not enough to avoid over-allocating memory. + // Consider that a goroutine wants to allocate a new span for a large object and + // there are no free swept spans, but there are small-object unswept spans. + // If the goroutine naively allocates a new span, it can surpass the yet-unknown + // target next_gc value. In order to prevent such cases (1) when a goroutine needs + // to allocate a new small-object span, it sweeps small-object spans for the same + // object size until it frees at least one object; (2) when a goroutine needs to + // allocate large-object span from heap, it sweeps spans until it frees at least + // that many pages into heap. Together these two measures ensure that we don't surpass + // target next_gc value by a large margin. There is an exception: if a goroutine sweeps + // and frees two nonadjacent one-page spans to the heap, it will allocate a new two-page span, + // but there can still be other one-page unswept spans which could be combined into a two-page span. + // It's critical to ensure that no operations proceed on unswept spans (that would corrupt + // mark bits in GC bitmap). During GC all mcaches are flushed into the central cache, + // so they are empty. When a goroutine grabs a new span into mcache, it sweeps it. + // When a goroutine explicitly frees an object or sets a finalizer, it ensures that + // the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish). + // The finalizer goroutine is kicked off only when all spans are swept. + // When the next GC starts, it sweeps all not-yet-swept spans (if any). + ++// GC rate. ++// Next GC is after we've allocated an extra amount of memory proportional to ++// the amount already in use. The proportion is controlled by GOGC environment variable ++// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M ++// (this mark is tracked in next_gc variable). This keeps the GC cost in linear ++// proportion to the allocation cost. Adjusting GOGC just changes the linear constant ++// (and also the amount of extra memory used). ++ + package runtime + + import "unsafe" + + const ( + _DebugGC = 0 + _DebugGCPtrs = false // if true, print trace of every pointer load during GC + _ConcurrentSweep = true + + _WorkbufSize = 4 * 1024 + _FinBlockSize = 4 * 1024 + _RootData = 0 + _RootBss = 1 + _RootFinalizers = 2 + _RootSpans = 3 + _RootFlushCaches = 4 + _RootCount = 5 + ) + + // ptrmask for an allocation containing a single pointer. + var oneptr = [...]uint8{bitsPointer} + -// Initialized from $GOGC. GOGC=off means no gc. ++// Initialized from $GOGC. GOGC=off means no GC. + var gcpercent int32 + + // Holding worldsema grants an M the right to try to stop the world. 
+ // The procedure is: + // + // semacquire(&worldsema); + // m.gcing = 1; + // stoptheworld(); + // + // ... do stuff ... + // + // m.gcing = 0; + // semrelease(&worldsema); + // starttheworld(); + // + var worldsema uint32 = 1 + ++// It is a bug if bits does not have bitBoundary set but ++// there are still some cases where this happens related ++// to stack spans. ++type markbits struct { ++ bitp *byte // pointer to the byte holding xbits ++ shift uintptr // bits xbits needs to be shifted to get bits ++ xbits byte // byte holding all the bits from *bitp ++ bits byte // mark and boundary bits relevant to corresponding slot. ++ tbits byte // pointer||scalar bits relevant to corresponding slot. ++} ++ + type workbuf struct { + node lfnode // must be first + nobj uintptr + obj [(_WorkbufSize - unsafe.Sizeof(lfnode{}) - ptrSize) / ptrSize]uintptr + } + + var data, edata, bss, ebss, gcdata, gcbss struct{} + + var finlock mutex // protects the following variables + var fing *g // goroutine that runs finalizers + var finq *finblock // list of finalizers that are to be executed + var finc *finblock // cache of free blocks + var finptrmask [_FinBlockSize / ptrSize / pointersPerByte]byte + var fingwait bool + var fingwake bool + var allfin *finblock // list of all blocks + + var gcdatamask bitvector + var gcbssmask bitvector + + var gclock mutex + + var badblock [1024]uintptr + var nbadblock int32 + + type workdata struct { + full uint64 // lock-free list of full blocks + empty uint64 // lock-free list of empty blocks ++ partial uint64 // lock-free list of partially filled blocks + pad0 [_CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait + nproc uint32 + tstart int64 + nwait uint32 + ndone uint32 + alldone note + markfor *parfor + + // Copy of mheap.allspans for marker or sweeper. + spans []*mspan + } + + var work workdata + + //go:linkname weak_cgo_allocate go.weak.runtime._cgo_allocate_internal + var weak_cgo_allocate byte + + // Is _cgo_allocate linked into the binary? + func have_cgo_allocate() bool { + return &weak_cgo_allocate != nil + } + -// scanblock scans a block of n bytes starting at pointer b for references -// to other objects, scanning any it finds recursively until there are no -// unscanned objects left. Instead of using an explicit recursion, it keeps -// a work list in the Workbuf* structures and loops in the main function -// body. Keeping an explicit work list is easier on the stack allocator and -// more efficient. -func scanblock(b, n uintptr, ptrmask *uint8) { - // Cache memory arena parameters in local vars. - arena_start := mheap_.arena_start - arena_used := mheap_.arena_used - - wbuf := getempty(nil) - nobj := wbuf.nobj - wp := &wbuf.obj[nobj] - keepworking := b == 0 ++// To help debug the concurrent GC we remark with the world ++// stopped ensuring that any object encountered has their normal ++// mark bit set. To do this we use an orthogonal bit ++// pattern to indicate the object is marked. The following pattern ++// uses the upper two bits in the object's bounday nibble. ++// 01: scalar not marked ++// 10: pointer not marked ++// 11: pointer marked ++// 00: scalar marked ++// Xoring with 01 will flip the pattern from marked to unmarked and vica versa. ++// The higher bit is 1 for pointers and 0 for scalars, whether the object ++// is marked or not. ++// The first nibble no longer holds the bitsDead pattern indicating that the ++// there are no more pointers in the object. This information is held ++// in the second nibble. 
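
Editor's note: the heap bitmap referred to throughout packs two words per byte. Within each 4-bit nibble the low two bits carry the boundary and mark bits, and the upper two bits carry the pointer/scalar type bits that the checkmark scheme reinterprets as the 00/01/10/11 patterns above. The decoder below mirrors what slottombits loads into a markbits; the concrete constant values are assumptions chosen to be consistent with the shifts and masks in this diff, not definitions copied from the runtime headers.

package main

import "fmt"

// Constant values chosen to match the shifts and masks used in the diff
// (gcBits = 4, two words per bitmap byte); they are assumptions of this
// sketch, not copied definitions.
const (
	gcBits      = 4
	bitBoundary = 1 << 0
	bitMarked   = 1 << 1
	bitMask     = bitBoundary | bitMarked
	bitPtrMask  = 3 << 2

	bitsScalar  = 1 // 01: scalar, not marked (checkmark encoding)
	bitsPointer = 2 // 10: pointer, not marked
)

// markbits mirrors the struct in the diff: everything decoded from one
// bitmap byte for a particular word slot.
type markbits struct {
	xbits byte // whole bitmap byte (two slots)
	shift uint // 0 for the first word in the byte, gcBits for the second
	bits  byte // boundary+mark bits for this slot
	tbits byte // pointer/scalar (type) bits for this slot
}

func decode(xbits byte, slot uint) markbits {
	m := markbits{xbits: xbits, shift: slot * gcBits}
	m.bits = (xbits >> m.shift) & bitMask
	m.tbits = ((xbits >> m.shift) & bitPtrMask) >> 2
	return m
}

func main() {
	// One byte describing two words: slot 0 is a marked pointer at an
	// object boundary, slot 1 is an unmarked scalar.
	b := byte(bitBoundary|bitMarked|bitsPointer<<2) | byte(bitsScalar<<2)<<gcBits

	for slot := uint(0); slot < 2; slot++ {
		m := decode(b, slot)
		fmt.Printf("slot %d: boundary=%v marked=%v pointer=%v\n",
			slot,
			m.bits&bitBoundary != 0,
			m.bits&bitMarked != 0,
			m.tbits == bitsPointer)
	}
}

Under checkmark, xoring a slot's upper bits with 01 flips it between marked and unmarked, which is the toggle docheckmark performs.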
++ ++// When marking an object if the bool checkmark is true one uses the above ++// encoding, otherwise one uses the bitMarked bit in the lower two bits ++// of the nibble. ++var ( ++ checkmark = false ++ gccheckmarkenable = true ++) + - var ptrbitp unsafe.Pointer ++// Is address b in the known heap. If it doesn't have a valid gcmap ++// returns false. For example pointers into stacks will return false. ++func inheap(b uintptr) bool { ++ if b == 0 || b < mheap_.arena_start || b >= mheap_.arena_used { ++ return false ++ } ++ // Not a beginning of a block, consult span table to find the block beginning. ++ k := b >> _PageShift ++ x := k ++ x -= mheap_.arena_start >> _PageShift ++ s := h_spans[x] ++ if s == nil || pageID(k) < s.start || b >= s.limit || s.state != mSpanInUse { ++ return false ++ } ++ return true ++} + - // ptrmask can have 2 possible values: - // 1. nil - obtain pointer mask from GC bitmap. - // 2. pointer to a compact mask (for stacks and data). - goto_scanobj := b != 0 ++// Given an address in the heap return the relevant byte from the gcmap. This routine ++// can be used on addresses to the start of an object or to the interior of the an object. ++func slottombits(obj uintptr, mbits *markbits) { ++ off := (obj&^(ptrSize-1) - mheap_.arena_start) / ptrSize ++ mbits.bitp = (*byte)(unsafe.Pointer(mheap_.arena_start - off/wordsPerBitmapByte - 1)) ++ mbits.shift = off % wordsPerBitmapByte * gcBits ++ mbits.xbits = *mbits.bitp ++ mbits.bits = (mbits.xbits >> mbits.shift) & bitMask ++ mbits.tbits = ((mbits.xbits >> mbits.shift) & bitPtrMask) >> 2 ++} + ++// b is a pointer into the heap. ++// Find the start of the object refered to by b. ++// Set mbits to the associated bits from the bit map. ++// If b is not a valid heap object return nil and ++// undefined values in mbits. ++func objectstart(b uintptr, mbits *markbits) uintptr { ++ obj := b &^ (ptrSize - 1) + for { - if goto_scanobj { - goto_scanobj = false - } else { - if nobj == 0 { - // Out of work in workbuf. - if !keepworking { - putempty(wbuf) - return - } ++ slottombits(obj, mbits) ++ if mbits.bits&bitBoundary == bitBoundary { ++ break ++ } + - // Refill workbuf from global queue. - wbuf = getfull(wbuf) - if wbuf == nil { - return - } - nobj = wbuf.nobj - if nobj < uintptr(len(wbuf.obj)) { - wp = &wbuf.obj[nobj] - } else { - wp = nil - } ++ // Not a beginning of a block, consult span table to find the block beginning. ++ k := b >> _PageShift ++ x := k ++ x -= mheap_.arena_start >> _PageShift ++ s := h_spans[x] ++ if s == nil || pageID(k) < s.start || b >= s.limit || s.state != mSpanInUse { ++ if s != nil && s.state == _MSpanStack { ++ return 0 // This is legit. + } + - // If another proc wants a pointer, give it some. - if work.nwait > 0 && nobj > 4 && work.full == 0 { - wbuf.nobj = nobj - wbuf = handoff(wbuf) - nobj = wbuf.nobj - if nobj < uintptr(len(wbuf.obj)) { - wp = &wbuf.obj[nobj] ++ // The following ensures that we are rigorous about what data ++ // structures hold valid pointers ++ if false { ++ // Still happens sometimes. We don't know why. 
++ printlock() ++ print("runtime:objectstart Span weird: obj=", hex(obj), " k=", hex(k)) ++ if s == nil { ++ print(" s=nil\n") + } else { - wp = nil ++ print(" s.start=", hex(s.start<<_PageShift), " s.limit=", hex(s.limit), " s.state=", s.state, "\n") + } ++ printunlock() ++ gothrow("objectstart: bad pointer in unexpected span") + } - - nobj-- - wp = &wbuf.obj[nobj] - b = *wp - n = arena_used - uintptr(b) - ptrmask = nil // use GC bitmap for pointer info ++ return 0 + } + - if _DebugGCPtrs { - print("scanblock ", b, " +", hex(n), " ", ptrmask, "\n") ++ p := uintptr(s.start) << _PageShift ++ if s.sizeclass != 0 { ++ size := s.elemsize ++ idx := (obj - p) / size ++ p = p + idx*size + } - - // Find bits of the beginning of the object. - if ptrmask == nil { - off := (uintptr(b) - arena_start) / ptrSize - ptrbitp = unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1) ++ if p == obj { ++ print("runtime: failed to find block beginning for ", hex(p), " s=", hex(s.start*_PageSize), " s.limit=", s.limit, "\n") ++ gothrow("failed to find block beginning") + } ++ obj = p ++ } + - var i uintptr - for i = 0; i < n; i += ptrSize { - // Find bits for this word. - var bits uintptr - if ptrmask == nil { - // Check if we have reached end of span. - if (uintptr(b)+i)%_PageSize == 0 && - h_spans[(uintptr(b)-arena_start)>>_PageShift] != h_spans[(uintptr(b)+i-arena_start)>>_PageShift] { - break - } ++ // if size(obj.firstfield) < PtrSize, the &obj.secondfield could map to the boundary bit ++ // Clear any low bits to get to the start of the object. ++ // greyobject depends on this. ++ return obj ++} + - // Consult GC bitmap. - bits = uintptr(*(*byte)(ptrbitp)) ++// Slow for now as we serialize this, since this is on a debug path ++// speed is not critical at this point. ++var andlock mutex + - if wordsPerBitmapByte != 2 { - gothrow("alg doesn't work for wordsPerBitmapByte != 2") - } - j := (uintptr(b) + i) / ptrSize & 1 - ptrbitp = add(ptrbitp, -j) - bits >>= gcBits * j ++func atomicand8(src *byte, val byte) { ++ lock(&andlock) ++ *src &= val ++ unlock(&andlock) ++} + - if bits&bitBoundary != 0 && i != 0 { - break // reached beginning of the next object - } - bits = (bits >> 2) & bitsMask - if bits == bitsDead { - break // reached no-scan part of the object - } - } else { - // dense mask (stack or data) - bits = (uintptr(*(*byte)(add(unsafe.Pointer(ptrmask), (i/ptrSize)/4))) >> (((i / ptrSize) % 4) * bitsPerPointer)) & bitsMask - } ++// Mark using the checkmark scheme. ++func docheckmark(mbits *markbits) { ++ // xor 01 moves 01(scalar unmarked) to 00(scalar marked) ++ // and 10(pointer unmarked) to 11(pointer marked) ++ if mbits.tbits == _BitsScalar { ++ atomicand8(mbits.bitp, ^byte(_BitsCheckMarkXor<> mbits.shift) & bitMask ++ mbits.tbits = ((mbits.xbits >> mbits.shift) & bitPtrMask) >> 2 ++} + - if bits != _BitsPointer { - gothrow("unexpected garbage collection bits") - } ++// In the default scheme does mbits refer to a marked object. ++func ismarked(mbits *markbits) bool { ++ if mbits.bits&bitBoundary != bitBoundary { ++ gothrow("ismarked: bits should have boundary bit set") ++ } ++ return mbits.bits&bitMarked == bitMarked ++} + - obj := *(*uintptr)(unsafe.Pointer(b + i)) - obj0 := obj ++// In the checkmark scheme does mbits refer to a marked object. 
++func ischeckmarked(mbits *markbits) bool { ++ if mbits.bits&bitBoundary != bitBoundary { ++ gothrow("ischeckmarked: bits should have boundary bit set") ++ } ++ return mbits.tbits == _BitsScalarMarked || mbits.tbits == _BitsPointerMarked ++} + - markobj: - var s *mspan - var off, bitp, shift, xbits uintptr ++// When in GCmarkterminate phase we allocate black. ++func gcmarknewobject_m(obj uintptr) { ++ if gcphase != _GCmarktermination { ++ gothrow("marking new object while not in mark termination phase") ++ } ++ if checkmark { // The world should be stopped so this should not happen. ++ gothrow("gcmarknewobject called while doing checkmark") ++ } + - // At this point we have extracted the next potential pointer. - // Check if it points into heap. - if obj == 0 { - continue - } - if obj < arena_start || arena_used <= obj { - if uintptr(obj) < _PhysPageSize && invalidptr != 0 { - s = nil - goto badobj - } - continue - } ++ var mbits markbits ++ slottombits(obj, &mbits) ++ if mbits.bits&bitMarked != 0 { ++ return ++ } + - // Mark the object. - obj &^= ptrSize - 1 - off = (obj - arena_start) / ptrSize - bitp = arena_start - off/wordsPerBitmapByte - 1 - shift = (off % wordsPerBitmapByte) * gcBits - xbits = uintptr(*(*byte)(unsafe.Pointer(bitp))) - bits = (xbits >> shift) & bitMask - if (bits & bitBoundary) == 0 { - // Not a beginning of a block, consult span table to find the block beginning. - k := pageID(obj >> _PageShift) - x := k - x -= pageID(arena_start >> _PageShift) - s = h_spans[x] - if s == nil || k < s.start || s.limit <= obj || s.state != mSpanInUse { - // Stack pointers lie within the arena bounds but are not part of the GC heap. - // Ignore them. - if s != nil && s.state == _MSpanStack { - continue - } - goto badobj - } - p := uintptr(s.start) << _PageShift - if s.sizeclass != 0 { - size := s.elemsize - idx := (obj - p) / size - p = p + idx*size - } - if p == obj { - print("runtime: failed to find block beginning for ", hex(p), " s=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), "\n") - gothrow("failed to find block beginning") ++ // Each byte of GC bitmap holds info for two words. ++ // If the current object is larger than two words, or if the object is one word ++ // but the object it shares the byte with is already marked, ++ // then all the possible concurrent updates are trying to set the same bit, ++ // so we can use a non-atomic update. ++ if mbits.xbits&(bitMask|(bitMask<bits=", hex(mbits.bits), " *mbits->bitp=", hex(*mbits.bitp), "\n") ++ ++ k := obj >> _PageShift ++ x := k ++ x -= mheap_.arena_start >> _PageShift ++ s := h_spans[x] ++ printlock() ++ print("runtime:greyobject Span: obj=", hex(obj), " k=", hex(k)) ++ if s == nil { ++ print(" s=nil\n") ++ } else { ++ print(" s.start=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, "\n") ++ // NOTE(rsc): This code is using s.sizeclass as an approximation of the ++ // number of pointer-sized words in an object. Perhaps not what was intended. 
++ for i := 0; i < int(s.sizeclass); i++ { ++ print(" *(obj+", i*ptrSize, ") = ", hex(*(*uintptr)(unsafe.Pointer(obj + uintptr(i)*ptrSize))), "\n") + } - obj = p - goto markobj + } ++ gothrow("checkmark found unmarked object") ++ } ++ if ischeckmarked(mbits) { ++ return wbuf ++ } ++ docheckmark(mbits) ++ if !ischeckmarked(mbits) { ++ print("mbits xbits=", hex(mbits.xbits), " bits=", hex(mbits.bits), " tbits=", hex(mbits.tbits), " shift=", mbits.shift, "\n") ++ gothrow("docheckmark and ischeckmarked disagree") ++ } ++ } else { ++ // If marked we have nothing to do. ++ if mbits.bits&bitMarked != 0 { ++ return wbuf ++ } + - if _DebugGCPtrs { - print("scan *", hex(b+i), " = ", hex(obj0), " => base ", hex(obj), "\n") - } ++ // Each byte of GC bitmap holds info for two words. ++ // If the current object is larger than two words, or if the object is one word ++ // but the object it shares the byte with is already marked, ++ // then all the possible concurrent updates are trying to set the same bit, ++ // so we can use a non-atomic update. ++ if mbits.xbits&(bitMask|bitMask< 0 && obj == badblock[nbadblock-1] { - // Running garbage collection again because - // we want to find the path from a root to a bad pointer. - // Found possible next step; extend or finish path. - for j := int32(0); j < nbadblock; j++ { - if badblock[j] == b { - goto AlreadyBad - } - } - print("runtime: found *(", hex(b), "+", hex(i), ") = ", hex(obj0), "+", hex(obj-obj0), "\n") - if ptrmask != nil { - gothrow("bad pointer") - } - if nbadblock >= int32(len(badblock)) { - gothrow("badblock trace too long") - } - badblock[nbadblock] = uintptr(b) - nbadblock++ - AlreadyBad: ++ if !checkmark && (mbits.xbits>>(mbits.shift+2))&_BitsMask == _BitsDead { ++ return wbuf // noscan object ++ } ++ ++ // Queue the obj for scanning. The PREFETCH(obj) logic has been removed but ++ // seems like a nice optimization that can be added back in. ++ // There needs to be time between the PREFETCH and the use. ++ // Previously we put the obj in an 8 element buffer that is drained at a rate ++ // to give the PREFETCH time to do its work. ++ // Use of PREFETCHNTA might be more appropriate than PREFETCH ++ ++ // If workbuf is full, obtain an empty one. ++ if wbuf.nobj >= uintptr(len(wbuf.obj)) { ++ wbuf = getempty(wbuf) ++ } ++ ++ wbuf.obj[wbuf.nobj] = obj ++ wbuf.nobj++ ++ return wbuf ++} ++ ++// Scan the object b of size n, adding pointers to wbuf. ++// Return possibly new wbuf to use. ++// If ptrmask != nil, it specifies where pointers are in b. ++// If ptrmask == nil, the GC bitmap should be consulted. ++// In this case, n may be an overestimate of the size; the GC bitmap ++// must also be used to make sure the scan stops at the end of b. ++func scanobject(b, n uintptr, ptrmask *uint8, wbuf *workbuf) *workbuf { ++ arena_start := mheap_.arena_start ++ arena_used := mheap_.arena_used ++ ++ // Find bits of the beginning of the object. ++ var ptrbitp unsafe.Pointer ++ var mbits markbits ++ if ptrmask == nil { ++ b = objectstart(b, &mbits) ++ if b == 0 { ++ return wbuf ++ } ++ ptrbitp = unsafe.Pointer(mbits.bitp) ++ } ++ for i := uintptr(0); i < n; i += ptrSize { ++ // Find bits for this word. ++ var bits uintptr ++ if ptrmask != nil { ++ // dense mask (stack or data) ++ bits = (uintptr(*(*byte)(add(unsafe.Pointer(ptrmask), (i/ptrSize)/4))) >> (((i / ptrSize) % 4) * bitsPerPointer)) & bitsMask ++ } else { ++ // Check if we have reached end of span. ++ // n is an overestimate of the size of the object. 
++ if (b+i)%_PageSize == 0 && h_spans[(b-arena_start)>>_PageShift] != h_spans[(b+i-arena_start)>>_PageShift] { ++ break + } + - // Now we have bits, bitp, and shift correct for - // obj pointing at the base of the object. - // Only care about not marked objects. - if bits&bitMarked != 0 { - continue ++ // Consult GC bitmap. ++ bits = uintptr(*(*byte)(ptrbitp)) ++ if wordsPerBitmapByte != 2 { ++ gothrow("alg doesn't work for wordsPerBitmapByte != 2") ++ } ++ j := (uintptr(b) + i) / ptrSize & 1 // j indicates upper nibble or lower nibble ++ bits >>= gcBits * j ++ if i == 0 { ++ bits &^= bitBoundary + } ++ ptrbitp = add(ptrbitp, -j) + - // If obj size is greater than 8, then each byte of GC bitmap - // contains info for at most one object. In such case we use - // non-atomic byte store to mark the object. This can lead - // to double enqueue of the object for scanning, but scanning - // is an idempotent operation, so it is OK. This cannot lead - // to bitmap corruption because the single marked bit is the - // only thing that can change in the byte. - // For 8-byte objects we use non-atomic store, if the other - // quadruple is already marked. Otherwise we resort to CAS - // loop for marking. - if xbits&(bitMask|bitMask<> 2 // bits refer to the type bits. + - if (xbits>>(shift+2))&bitsMask == bitsDead { - continue // noscan object ++ if i != 0 && bits == bitsDead { // BitsDead in first nibble not valid during checkmark ++ break // reached no-scan part of the object + } ++ } + - // Queue the obj for scanning. - // TODO: PREFETCH here. ++ if bits <= _BitsScalar { // _BitsScalar, _BitsDead, _BitsScalarMarked ++ continue ++ } + - // If workbuf is full, obtain an empty one. - if nobj >= uintptr(len(wbuf.obj)) { - wbuf.nobj = nobj - wbuf = getempty(wbuf) - nobj = wbuf.nobj - wp = &wbuf.obj[nobj] - } - *wp = obj - nobj++ - if nobj < uintptr(len(wbuf.obj)) { - wp = &wbuf.obj[nobj] - } else { - wp = nil - } ++ if bits&_BitsPointer != _BitsPointer { ++ print("gc checkmark=", checkmark, " b=", hex(b), " ptrmask=", ptrmask, " mbits.bitp=", mbits.bitp, " mbits.xbits=", hex(mbits.xbits), " bits=", hex(bits), "\n") ++ gothrow("unexpected garbage collection bits") ++ } ++ ++ obj := *(*uintptr)(unsafe.Pointer(b + i)) ++ ++ // At this point we have extracted the next potential pointer. ++ // Check if it points into heap. ++ if obj == 0 || obj < arena_start || obj >= arena_used { + continue ++ } + - badobj: - // If cgo_allocate is linked into the binary, it can allocate - // memory as []unsafe.Pointer that may not contain actual - // pointers and must be scanned conservatively. - // In this case alone, allow the bad pointer. - if have_cgo_allocate() && ptrmask == nil { - continue - } ++ // Mark the object. return some important bits. ++ // We we combine the following two rotines we don't have to pass mbits or obj around. ++ var mbits markbits ++ obj = objectstart(obj, &mbits) ++ if obj == 0 { ++ continue ++ } ++ wbuf = greyobject(obj, &mbits, wbuf) ++ } ++ return wbuf ++} + - // Anything else indicates a bug somewhere. - // If we're in the middle of chasing down a different bad pointer, - // don't confuse the trace by printing about this one. - if nbadblock > 0 { - continue ++// scanblock starts by scanning b as scanobject would. ++// If the gcphase is GCscan, that's all scanblock does. ++// Otherwise it traverses some fraction of the pointers it found in b, recursively. ++// As a special case, scanblock(nil, 0, nil) means to scan previously queued work, ++// stopping only when no work is left in the system. 
++func scanblock(b, n uintptr, ptrmask *uint8) { ++ wbuf := getpartialorempty() ++ if b != 0 { ++ wbuf = scanobject(b, n, ptrmask, wbuf) ++ if gcphase == _GCscan { ++ if inheap(b) && ptrmask == nil { ++ // b is in heap, we are in GCscan so there should be a ptrmask. ++ gothrow("scanblock: In GCscan phase and inheap is true.") + } ++ // GCscan only goes one level deep since mark wb not turned on. ++ putpartial(wbuf) ++ return ++ } ++ } ++ if gcphase == _GCscan { ++ gothrow("scanblock: In GCscan phase but no b passed in.") ++ } + - print("runtime: garbage collector found invalid heap pointer *(", hex(b), "+", hex(i), ")=", hex(obj)) - if s == nil { - print(" s=nil\n") - } else { - print(" span=", uintptr(s.start)<<_PageShift, "-", s.limit, "-", (uintptr(s.start)+s.npages)<<_PageShift, " state=", s.state, "\n") ++ keepworking := b == 0 ++ ++ // ptrmask can have 2 possible values: ++ // 1. nil - obtain pointer mask from GC bitmap. ++ // 2. pointer to a compact mask (for stacks and data). ++ for { ++ if wbuf.nobj == 0 { ++ if !keepworking { ++ putempty(wbuf) ++ return + } - if ptrmask != nil { - gothrow("invalid heap pointer") ++ // Refill workbuf from global queue. ++ wbuf = getfull(wbuf) ++ if wbuf == nil { // nil means out of work barrier reached ++ return + } - // Add to badblock list, which will cause the garbage collection - // to keep repeating until it has traced the chain of pointers - // leading to obj all the way back to a root. - if nbadblock == 0 { - badblock[nbadblock] = uintptr(b) - nbadblock++ ++ ++ if wbuf.nobj <= 0 { ++ gothrow("runtime:scanblock getfull returns empty buffer") + } + } - if _DebugGCPtrs { - print("end scanblock ", hex(b), " +", hex(n), " ", ptrmask, "\n") - } - if _DebugGC > 0 && ptrmask == nil { - // For heap objects ensure that we did not overscan. - var p, n uintptr - if mlookup(b, &p, &n, nil) == 0 || b != p || i > n { - print("runtime: scanned (", hex(b), "+", hex(i), "), heap object (", hex(p), "+", hex(n), ")\n") - gothrow("scanblock: scanned invalid object") - } ++ ++ // If another proc wants a pointer, give it some. ++ if work.nwait > 0 && wbuf.nobj > 4 && work.full == 0 { ++ wbuf = handoff(wbuf) + } ++ ++ // This might be a good place to add prefetch code... ++ // if(wbuf->nobj > 4) { ++ // PREFETCH(wbuf->obj[wbuf->nobj - 3]; ++ // } ++ wbuf.nobj-- ++ b = wbuf.obj[wbuf.nobj] ++ wbuf = scanobject(b, mheap_.arena_used-b, nil, wbuf) + } + } + + func markroot(desc *parfor, i uint32) { + // Note: if you add a case here, please also update heapdump.c:dumproots. 
+ switch i { + case _RootData: + scanblock(uintptr(unsafe.Pointer(&data)), uintptr(unsafe.Pointer(&edata))-uintptr(unsafe.Pointer(&data)), gcdatamask.bytedata) + + case _RootBss: + scanblock(uintptr(unsafe.Pointer(&bss)), uintptr(unsafe.Pointer(&ebss))-uintptr(unsafe.Pointer(&bss)), gcbssmask.bytedata) + + case _RootFinalizers: + for fb := allfin; fb != nil; fb = fb.alllink { + scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), uintptr(fb.cnt)*unsafe.Sizeof(fb.fin[0]), &finptrmask[0]) + } + + case _RootSpans: + // mark MSpan.specials + sg := mheap_.sweepgen + for spanidx := uint32(0); spanidx < uint32(len(work.spans)); spanidx++ { + s := work.spans[spanidx] + if s.state != mSpanInUse { + continue + } - if s.sweepgen != sg { ++ if !checkmark && s.sweepgen != sg { ++ // sweepgen was updated (+2) during non-checkmark GC pass + print("sweep ", s.sweepgen, " ", sg, "\n") + gothrow("gc: unswept span") + } + for sp := s.specials; sp != nil; sp = sp.next { + if sp.kind != _KindSpecialFinalizer { + continue + } + // don't mark finalized object, but scan it so we + // retain everything it points to. + spf := (*specialfinalizer)(unsafe.Pointer(sp)) + // A finalizer can be set for an inner byte of an object, find object beginning. + p := uintptr(s.start<<_PageShift) + uintptr(spf.special.offset)/s.elemsize*s.elemsize - scanblock(p, s.elemsize, nil) ++ if gcphase != _GCscan { ++ scanblock(p, s.elemsize, nil) // scanned during mark phase ++ } + scanblock(uintptr(unsafe.Pointer(&spf.fn)), ptrSize, &oneptr[0]) + } + } + + case _RootFlushCaches: - flushallmcaches() ++ if gcphase != _GCscan { // Do not flush mcaches during GCscan phase. ++ flushallmcaches() ++ } + + default: + // the rest is scanning goroutine stacks + if uintptr(i-_RootCount) >= allglen { + gothrow("markroot: bad index") + } + gp := allgs[i-_RootCount] ++ + // remember when we've first observed the G blocked + // needed only to output in traceback - status := readgstatus(gp) ++ status := readgstatus(gp) // We are not in a scan state + if (status == _Gwaiting || status == _Gsyscall) && gp.waitsince == 0 { + gp.waitsince = work.tstart + } - // Shrink a stack if not much of it is being used. - shrinkstack(gp) ++ ++ // Shrink a stack if not much of it is being used but not in the scan phase. ++ if gcphase != _GCscan { // Do not shrink during GCscan phase. ++ shrinkstack(gp) ++ } + if readgstatus(gp) == _Gdead { + gp.gcworkdone = true + } else { + gp.gcworkdone = false + } + restart := stopg(gp) - scanstack(gp) ++ ++ // goroutine will scan its own stack when it stops running. ++ // Wait until it has. ++ for readgstatus(gp) == _Grunning && !gp.gcworkdone { ++ } ++ ++ // scanstack(gp) is done as part of gcphasework ++ // But to make sure we finished we need to make sure that ++ // the stack traps have all responded so drop into ++ // this while loop until they respond. ++ for !gp.gcworkdone { ++ status = readgstatus(gp) ++ if status == _Gdead { ++ gp.gcworkdone = true // scan is a noop ++ break ++ } ++ if status == _Gwaiting || status == _Grunnable { ++ restart = stopg(gp) ++ } ++ } + if restart { + restartg(gp) + } + } + } + + // Get an empty work buffer off the work.empty list, + // allocating new buffers as needed. 
+ func getempty(b *workbuf) *workbuf { - _g_ := getg() + if b != nil { - lfstackpush(&work.full, &b.node) ++ putfull(b) ++ b = nil + } - b = nil - c := _g_.m.mcache - if c.gcworkbuf != nil { - b = (*workbuf)(c.gcworkbuf) - c.gcworkbuf = nil - } - if b == nil { ++ if work.empty != 0 { + b = (*workbuf)(lfstackpop(&work.empty)) + } ++ if b != nil && b.nobj != 0 { ++ _g_ := getg() ++ print("m", _g_.m.id, ": getempty: popped b=", b, " with non-zero b.nobj=", b.nobj, "\n") ++ gothrow("getempty: workbuffer not empty, b->nobj not 0") ++ } + if b == nil { + b = (*workbuf)(persistentalloc(unsafe.Sizeof(*b), _CacheLineSize, &memstats.gc_sys)) ++ b.nobj = 0 + } - b.nobj = 0 + return b + } + + func putempty(b *workbuf) { - _g_ := getg() - c := _g_.m.mcache - if c.gcworkbuf == nil { - c.gcworkbuf = (unsafe.Pointer)(b) - return ++ if b.nobj != 0 { ++ gothrow("putempty: b->nobj not 0") + } + lfstackpush(&work.empty, &b.node) + } + -func gcworkbuffree(b unsafe.Pointer) { - if b != nil { - putempty((*workbuf)(b)) ++func putfull(b *workbuf) { ++ if b.nobj <= 0 { ++ gothrow("putfull: b->nobj <= 0") ++ } ++ lfstackpush(&work.full, &b.node) ++} ++ ++// Get an partially empty work buffer ++// if none are available get an empty one. ++func getpartialorempty() *workbuf { ++ b := (*workbuf)(lfstackpop(&work.partial)) ++ if b == nil { ++ b = getempty(nil) + } ++ return b + } + -// Get a full work buffer off the work.full list, or return nil. ++func putpartial(b *workbuf) { ++ if b.nobj == 0 { ++ lfstackpush(&work.empty, &b.node) ++ } else if b.nobj < uintptr(len(b.obj)) { ++ lfstackpush(&work.partial, &b.node) ++ } else if b.nobj == uintptr(len(b.obj)) { ++ lfstackpush(&work.full, &b.node) ++ } else { ++ print("b=", b, " b.nobj=", b.nobj, " len(b.obj)=", len(b.obj), "\n") ++ gothrow("putpartial: bad Workbuf b.nobj") ++ } ++} ++ ++// Get a full work buffer off the work.full or a partially ++// filled one off the work.partial list. If nothing is available ++// wait until all the other gc helpers have finished and then ++// return nil. ++// getfull acts as a barrier for work.nproc helpers. As long as one ++// gchelper is actively marking objects it ++// may create a workbuffer that the other helpers can work on. ++// The for loop either exits when a work buffer is found ++// or when _all_ of the work.nproc GC helpers are in the loop ++// looking for work and thus not capable of creating new work. ++// This is in fact the termination condition for the STW mark ++// phase. + func getfull(b *workbuf) *workbuf { + if b != nil { - lfstackpush(&work.empty, &b.node) ++ putempty(b) + } ++ + b = (*workbuf)(lfstackpop(&work.full)) ++ if b == nil { ++ b = (*workbuf)(lfstackpop(&work.partial)) ++ } + if b != nil || work.nproc == 1 { + return b + } + + xadd(&work.nwait, +1) + for i := 0; ; i++ { + if work.full != 0 { + xadd(&work.nwait, -1) + b = (*workbuf)(lfstackpop(&work.full)) ++ if b == nil { ++ b = (*workbuf)(lfstackpop(&work.partial)) ++ } + if b != nil { + return b + } + xadd(&work.nwait, +1) + } + if work.nwait == work.nproc { + return nil + } + _g_ := getg() + if i < 10 { + _g_.m.gcstats.nprocyield++ + procyield(20) + } else if i < 20 { + _g_.m.gcstats.nosyield++ + osyield() + } else { + _g_.m.gcstats.nsleep++ + usleep(100) + } + } + } + + func handoff(b *workbuf) *workbuf { + // Make new buffer with half of b's pointers. 
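
The new empty/partial/full split is mostly bookkeeping: a buffer is routed by how many objects it holds, and getfull doubles as the termination barrier once every helper is waiting. A minimal sketch of just the routing rule in putpartial, with plain slices (emptyList, partialList, fullList are illustrative stand-ins for the lock-free lists):

package main

import "fmt"

// buf mirrors the occupancy bookkeeping of a workbuf; the three slices
// below are illustrative stand-ins for work.empty, work.partial and
// work.full.
type buf struct {
	obj  [512]uintptr
	nobj int
}

var emptyList, partialList, fullList []*buf

// putpartialSketch routes a buffer exactly as putpartial above does:
// by how full it is, throwing on an impossible count.
func putpartialSketch(b *buf) {
	switch {
	case b.nobj == 0:
		emptyList = append(emptyList, b)
	case b.nobj < len(b.obj):
		partialList = append(partialList, b)
	case b.nobj == len(b.obj):
		fullList = append(fullList, b)
	default:
		panic("putpartial: bad workbuf occupancy")
	}
}

func main() {
	putpartialSketch(&buf{})          // empty
	putpartialSketch(&buf{nobj: 10})  // partial
	putpartialSketch(&buf{nobj: 512}) // full
	fmt.Println(len(emptyList), len(partialList), len(fullList)) // 1 1 1
}
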
+ b1 := getempty(nil) + n := b.nobj / 2 + b.nobj -= n + b1.nobj = n + memmove(unsafe.Pointer(&b1.obj[0]), unsafe.Pointer(&b.obj[b.nobj]), n*unsafe.Sizeof(b1.obj[0])) + _g_ := getg() + _g_.m.gcstats.nhandoff++ + _g_.m.gcstats.nhandoffcnt += uint64(n) + + // Put b on full list - let first half of b get stolen. + lfstackpush(&work.full, &b.node) + return b1 + } + + func stackmapdata(stkmap *stackmap, n int32) bitvector { + if n < 0 || n >= stkmap.n { + gothrow("stackmapdata: index out of range") + } + return bitvector{stkmap.nbit, (*byte)(add(unsafe.Pointer(&stkmap.bytedata), uintptr(n*((stkmap.nbit+31)/32*4))))} + } + + // Scan a stack frame: local variables and function arguments/results. + func scanframe(frame *stkframe, unused unsafe.Pointer) bool { + + f := frame.fn + targetpc := frame.continpc + if targetpc == 0 { + // Frame is dead. + return true + } + if _DebugGC > 1 { + print("scanframe ", gofuncname(f), "\n") + } + if targetpc != f.entry { + targetpc-- + } + pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc) + if pcdata == -1 { + // We do not have a valid pcdata value but there might be a + // stackmap for this function. It is likely that we are looking + // at the function prologue, assume so and hope for the best. + pcdata = 0 + } + + // Scan local variables if stack frame has been allocated. + size := frame.varp - frame.sp + var minsize uintptr + if thechar != '6' && thechar != '8' { + minsize = ptrSize + } else { + minsize = 0 + } + if size > minsize { + stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps)) + if stkmap == nil || stkmap.n <= 0 { + print("runtime: frame ", gofuncname(f), " untyped locals ", hex(frame.varp-size), "+", hex(size), "\n") + gothrow("missing stackmap") + } + + // Locals bitmap information, scan just the pointers in locals. + if pcdata < 0 || pcdata >= stkmap.n { + // don't know where we are + print("runtime: pcdata is ", pcdata, " and ", stkmap.n, " locals stack map entries for ", gofuncname(f), " (targetpc=", targetpc, ")\n") + gothrow("scanframe: bad symbol table") + } + bv := stackmapdata(stkmap, pcdata) + size = (uintptr(bv.n) * ptrSize) / bitsPerPointer + scanblock(frame.varp-size, uintptr(bv.n)/bitsPerPointer*ptrSize, bv.bytedata) + } + + // Scan arguments. + if frame.arglen > 0 { + var bv bitvector + if frame.argmap != nil { + bv = *frame.argmap + } else { + stkmap := (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps)) + if stkmap == nil || stkmap.n <= 0 { + print("runtime: frame ", gofuncname(f), " untyped args ", hex(frame.argp), "+", hex(frame.arglen), "\n") + gothrow("missing stackmap") + } + if pcdata < 0 || pcdata >= stkmap.n { + // don't know where we are + print("runtime: pcdata is ", pcdata, " and ", stkmap.n, " args stack map entries for ", gofuncname(f), " (targetpc=", targetpc, ")\n") + gothrow("scanframe: bad symbol table") + } + bv = stackmapdata(stkmap, pcdata) + } + scanblock(frame.argp, uintptr(bv.n)/bitsPerPointer*ptrSize, bv.bytedata) + } + return true + } + + func scanstack(gp *g) { + // TODO(rsc): Due to a precedence error, this was never checked in the original C version. + // If you enable the check, the gothrow happens. 
+ /* + if readgstatus(gp)&_Gscan == 0 { + print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n") + gothrow("mark - bad status") + } + */ + + switch readgstatus(gp) &^ _Gscan { + default: + print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n") + gothrow("mark - bad status") + case _Gdead: + return + case _Grunning: + print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n") - gothrow("mark - world not stopped") ++ gothrow("scanstack: goroutine not stopped") + case _Grunnable, _Gsyscall, _Gwaiting: + // ok + } + + if gp == getg() { + gothrow("can't scan our own stack") + } + mp := gp.m + if mp != nil && mp.helpgc != 0 { + gothrow("can't scan gchelper stack") + } + + gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, scanframe, nil, 0) + tracebackdefers(gp, scanframe, nil) + } + -// The gp has been moved to a gc safepoint. If there is gcphase specific -// work it is done here. ++// If the slot is grey or black return true, if white return false. ++// If the slot is not in the known heap and thus does not have a valid GC bitmap then ++// it is considered grey. Globals and stacks can hold such slots. ++// The slot is grey if its mark bit is set and it is enqueued to be scanned. ++// The slot is black if it has already been scanned. ++// It is white if it has a valid mark bit and the bit is not set. ++func shaded(slot uintptr) bool { ++ if !inheap(slot) { // non-heap slots considered grey ++ return true ++ } ++ ++ var mbits markbits ++ valid := objectstart(slot, &mbits) ++ if valid == 0 { ++ return true ++ } ++ ++ if checkmark { ++ return ischeckmarked(&mbits) ++ } ++ ++ return mbits.bits&bitMarked != 0 ++} ++ ++// Shade the object if it isn't already. ++// The object is not nil and known to be in the heap. ++func shade(b uintptr) { ++ if !inheap(b) { ++ gothrow("shade: passed an address not in the heap") ++ } ++ ++ wbuf := getpartialorempty() ++ // Mark the object, return some important bits. ++ // If we combine the following two rotines we don't have to pass mbits or obj around. ++ var mbits markbits ++ obj := objectstart(b, &mbits) ++ if obj != 0 { ++ wbuf = greyobject(obj, &mbits, wbuf) // augments the wbuf ++ } ++ putpartial(wbuf) ++} ++ ++// This is the Dijkstra barrier coarsened to always shade the ptr (dst) object. ++// The original Dijkstra barrier only shaded ptrs being placed in black slots. ++// ++// Shade indicates that it has seen a white pointer by adding the referent ++// to wbuf as well as marking it. ++// ++// slot is the destination (dst) in go code ++// ptr is the value that goes into the slot (src) in the go code ++// ++// Dijkstra pointed out that maintaining the no black to white ++// pointers means that white to white pointers not need ++// to be noted by the write barrier. Furthermore if either ++// white object dies before it is reached by the ++// GC then the object can be collected during this GC cycle ++// instead of waiting for the next cycle. Unfortunately the cost of ++// ensure that the object holding the slot doesn't concurrently ++// change to black without the mutator noticing seems prohibitive. ++// ++// Consider the following example where the mutator writes into ++// a slot and then loads the slot's mark bit while the GC thread ++// writes to the slot's mark bit and then as part of scanning reads ++// the slot. 
++// ++// Initially both [slot] and [slotmark] are 0 (nil) ++// Mutator thread GC thread ++// st [slot], ptr st [slotmark], 1 ++// ++// ld r1, [slotmark] ld r2, [slot] ++// ++// This is a classic example of independent reads of independent writes, ++// aka IRIW. The question is if r1==r2==0 is allowed and for most HW the ++// answer is yes without inserting a memory barriers between the st and the ld. ++// These barriers are expensive so we have decided that we will ++// always grey the ptr object regardless of the slot's color. ++func gcmarkwb_m(slot *uintptr, ptr uintptr) { ++ switch gcphase { ++ default: ++ gothrow("gcphasework in bad gcphase") ++ ++ case _GCoff, _GCquiesce, _GCstw, _GCsweep, _GCscan: ++ // ok ++ ++ case _GCmark, _GCmarktermination: ++ if ptr != 0 && inheap(ptr) { ++ shade(ptr) ++ } ++ } ++} ++ ++// The gp has been moved to a GC safepoint. GC phase specific ++// work is done here. + func gcphasework(gp *g) { + switch gcphase { + default: + gothrow("gcphasework in bad gcphase") + case _GCoff, _GCquiesce, _GCstw, _GCsweep: - // No work for now. ++ // No work. ++ case _GCscan: ++ // scan the stack, mark the objects, put pointers in work buffers ++ // hanging off the P where this is being run. ++ scanstack(gp) + case _GCmark: - // Disabled until concurrent GC is implemented - // but indicate the scan has been done. - // scanstack(gp); ++ // No work. ++ case _GCmarktermination: ++ scanstack(gp) ++ // All available mark work will be emptied before returning. + } + gp.gcworkdone = true + } + + var finalizer1 = [...]byte{ + // Each Finalizer is 5 words, ptr ptr uintptr ptr ptr. + // Each byte describes 4 words. + // Need 4 Finalizers described by 5 bytes before pattern repeats: + // ptr ptr uintptr ptr ptr + // ptr ptr uintptr ptr ptr + // ptr ptr uintptr ptr ptr + // ptr ptr uintptr ptr ptr + // aka + // ptr ptr uintptr ptr + // ptr ptr ptr uintptr + // ptr ptr ptr ptr + // uintptr ptr ptr ptr + // ptr uintptr ptr ptr + // Assumptions about Finalizer layout checked below. + bitsPointer | bitsPointer<<2 | bitsScalar<<4 | bitsPointer<<6, + bitsPointer | bitsPointer<<2 | bitsPointer<<4 | bitsScalar<<6, + bitsPointer | bitsPointer<<2 | bitsPointer<<4 | bitsPointer<<6, + bitsScalar | bitsPointer<<2 | bitsPointer<<4 | bitsPointer<<6, + bitsPointer | bitsScalar<<2 | bitsPointer<<4 | bitsPointer<<6, + } + + func queuefinalizer(p unsafe.Pointer, fn *funcval, nret uintptr, fint *_type, ot *ptrtype) { + lock(&finlock) + if finq == nil || finq.cnt == finq.cap { + if finc == nil { + finc = (*finblock)(persistentalloc(_FinBlockSize, 0, &memstats.gc_sys)) + finc.cap = int32((_FinBlockSize-unsafe.Sizeof(finblock{}))/unsafe.Sizeof(finalizer{}) + 1) + finc.alllink = allfin + allfin = finc + if finptrmask[0] == 0 { + // Build pointer mask for Finalizer array in block. + // Check assumptions made in finalizer1 array above. 
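
Concretely, the coarsened Dijkstra barrier above reduces to: on every pointer store during the mark phases, grey the pointee, no matter what colour the destination object is. A toy tricolour version with an explicit grey queue instead of mark bits and workbufs (object, shade, writePointer are illustrative names, not runtime API):

package main

import "fmt"

type color int

const (
	white color = iota // not yet reached by this GC
	grey               // reached, fields not yet scanned
	black              // reached and scanned
)

type object struct {
	color color
	next  *object
}

// greyQueue stands in for the mark work buffers.
var greyQueue []*object

// shade greys a white object and queues it for scanning, mirroring
// what shade/greyobject above do with mark bits and workbufs.
func shade(o *object) {
	if o != nil && o.color == white {
		o.color = grey
		greyQueue = append(greyQueue, o)
	}
}

// writePointer is the coarsened Dijkstra barrier from gcmarkwb_m:
// during marking, always shade the pointer being stored, regardless
// of the colour of the object that holds the slot.
func writePointer(slot **object, ptr *object, marking bool) {
	*slot = ptr
	if marking {
		shade(ptr)
	}
}

func main() {
	a := &object{color: black}
	b := &object{} // white object about to be stored into a black one
	writePointer(&a.next, b, true)
	fmt.Println(b.color == grey, len(greyQueue)) // true 1
}
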
+ if (unsafe.Sizeof(finalizer{}) != 5*ptrSize || + unsafe.Offsetof(finalizer{}.fn) != 0 || + unsafe.Offsetof(finalizer{}.arg) != ptrSize || + unsafe.Offsetof(finalizer{}.nret) != 2*ptrSize || + unsafe.Offsetof(finalizer{}.fint) != 3*ptrSize || + unsafe.Offsetof(finalizer{}.ot) != 4*ptrSize || + bitsPerPointer != 2) { + gothrow("finalizer out of sync") + } + for i := range finptrmask { + finptrmask[i] = finalizer1[i%len(finalizer1)] + } + } + } + block := finc + finc = block.next + block.next = finq + finq = block + } + f := (*finalizer)(add(unsafe.Pointer(&finq.fin[0]), uintptr(finq.cnt)*unsafe.Sizeof(finq.fin[0]))) + finq.cnt++ + f.fn = fn + f.nret = nret + f.fint = fint + f.ot = ot + f.arg = p + fingwake = true + unlock(&finlock) + } + + func iterate_finq(callback func(*funcval, unsafe.Pointer, uintptr, *_type, *ptrtype)) { + for fb := allfin; fb != nil; fb = fb.alllink { + for i := int32(0); i < fb.cnt; i++ { + f := &fb.fin[i] + callback(f.fn, f.arg, f.nret, f.fint, f.ot) + } + } + } + ++// Returns only when span s has been swept. + func mSpan_EnsureSwept(s *mspan) { + // Caller must disable preemption. + // Otherwise when this function returns the span can become unswept again + // (if GC is triggered on another goroutine). + _g_ := getg() + if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 { + gothrow("MSpan_EnsureSwept: m is not locked") + } + + sg := mheap_.sweepgen + if atomicload(&s.sweepgen) == sg { + return + } ++ // The caller must be sure that the span is a MSpanInUse span. + if cas(&s.sweepgen, sg-2, sg-1) { + mSpan_Sweep(s, false) + return + } + // unfortunate condition, and we don't have efficient means to wait + for atomicload(&s.sweepgen) != sg { + osyield() + } + } + + // Sweep frees or collects finalizers for blocks not marked in the mark phase. + // It clears the mark bits in preparation for the next GC round. + // Returns true if the span was returned to heap. + // If preserve=true, don't return it to heap nor relink in MCentral lists; + // caller takes care of it. + func mSpan_Sweep(s *mspan, preserve bool) bool { ++ if checkmark { ++ gothrow("MSpan_Sweep: checkmark only runs in STW and after the sweep") ++ } ++ + // It's critical that we enter this function with preemption disabled, + // GC must not start while we are in the middle of this function. + _g_ := getg() + if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 { + gothrow("MSpan_Sweep: m is not locked") + } + sweepgen := mheap_.sweepgen + if s.state != mSpanInUse || s.sweepgen != sweepgen-1 { + print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n") + gothrow("MSpan_Sweep: bad span state") + } + arena_start := mheap_.arena_start + cl := s.sizeclass + size := s.elemsize + var n int32 + var npages int32 + if cl == 0 { + n = 1 + } else { + // Chunk full of small blocks. + npages = class_to_allocnpages[cl] + n = (npages << _PageShift) / int32(size) + } + res := false + nfree := 0 + var head mlink + end := &head + c := _g_.m.mcache + sweepgenset := false + + // Mark any free objects in this span so we don't collect them. + for link := s.freelist; link != nil; link = link.next { + off := (uintptr(unsafe.Pointer(link)) - arena_start) / ptrSize + bitp := arena_start - off/wordsPerBitmapByte - 1 + shift := (off % wordsPerBitmapByte) * gcBits + *(*byte)(unsafe.Pointer(bitp)) |= bitMarked << shift + } + + // Unlink & free special records for any objects we're about to free. 
+ specialp := &s.specials + special := *specialp + for special != nil { + // A finalizer can be set for an inner byte of an object, find object beginning. + p := uintptr(s.start<<_PageShift) + uintptr(special.offset)/size*size + off := (p - arena_start) / ptrSize + bitp := arena_start - off/wordsPerBitmapByte - 1 + shift := (off % wordsPerBitmapByte) * gcBits + bits := (*(*byte)(unsafe.Pointer(bitp)) >> shift) & bitMask + if bits&bitMarked == 0 { + // Find the exact byte for which the special was setup + // (as opposed to object beginning). + p := uintptr(s.start<<_PageShift) + uintptr(special.offset) + // about to free object: splice out special record + y := special + special = special.next + *specialp = special + if !freespecial(y, unsafe.Pointer(p), size, false) { + // stop freeing of object if it has a finalizer + *(*byte)(unsafe.Pointer(bitp)) |= bitMarked << shift + } + } else { + // object is still live: keep special record + specialp = &special.next + special = *specialp + } + } + + // Sweep through n objects of given size starting at p. + // This thread owns the span now, so it can manipulate + // the block bitmap without atomic operations. + p := uintptr(s.start << _PageShift) + off := (p - arena_start) / ptrSize + bitp := arena_start - off/wordsPerBitmapByte - 1 + shift := uint(0) + step := size / (ptrSize * wordsPerBitmapByte) + // Rewind to the previous quadruple as we move to the next + // in the beginning of the loop. + bitp += step + if step == 0 { + // 8-byte objects. + bitp++ + shift = gcBits + } + for ; n > 0; n, p = n-1, p+size { + bitp -= step + if step == 0 { + if shift != 0 { + bitp-- + } + shift = gcBits - shift + } + + xbits := *(*byte)(unsafe.Pointer(bitp)) + bits := (xbits >> shift) & bitMask + + // Allocated and marked object, reset bits to allocated. + if bits&bitMarked != 0 { + *(*byte)(unsafe.Pointer(bitp)) &^= bitMarked << shift + continue + } + + // At this point we know that we are looking at garbage object + // that needs to be collected. + if debug.allocfreetrace != 0 { + tracefree(unsafe.Pointer(p), size) + } + + // Reset to allocated+noscan. + *(*byte)(unsafe.Pointer(bitp)) = uint8(uintptr(xbits&^((bitMarked|bitsMask<<2)< 0 { + s.limit = 0 // prevent mlookup from finding this span + sysFault(unsafe.Pointer(p), size) + } else { + mHeap_Free(&mheap_, s, 1) + } + c.local_nlargefree++ + c.local_largefree += size + xadd64(&memstats.next_gc, -int64(size)*int64(gcpercent+100)/100) + res = true + } else { + // Free small object. + if size > 2*ptrSize { + *(*uintptr)(unsafe.Pointer(p + ptrSize)) = uintptrMask & 0xdeaddeaddeaddead // mark as "needs to be zeroed" + } else if size > ptrSize { + *(*uintptr)(unsafe.Pointer(p + ptrSize)) = 0 + } + end.next = (*mlink)(unsafe.Pointer(p)) + end = end.next + nfree++ + } + } + + // We need to set s.sweepgen = h.sweepgen only when all blocks are swept, + // because of the potential for a concurrent free/SetFinalizer. + // But we need to set it before we make the span available for allocation + // (return it to heap or mcentral), because allocation code assumes that a + // span is already swept if available for allocation. + if !sweepgenset && nfree == 0 { + // The span must be in our exclusive ownership until we update sweepgen, + // check for potential races. 
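
Stripped of the bitmap arithmetic and the finalizer special cases, the per-span sweep above is: clear the mark on every marked object so it starts the next cycle white, and put every unmarked object on the free list. A simplified sketch over a boolean mark array:

package main

import "fmt"

// sweepSketch frees every unmarked slot and clears the mark on the rest,
// returning the indices of freed slots. marks stands in for the
// per-object mark bits kept in the heap bitmap.
func sweepSketch(marks []bool) (freed []int) {
	for i := range marks {
		if marks[i] {
			marks[i] = false // survivor: reset for the next GC cycle
			continue
		}
		freed = append(freed, i) // garbage: would go on the span free list
	}
	return freed
}

func main() {
	marks := []bool{true, false, true, false, false}
	fmt.Println(sweepSketch(marks)) // [1 3 4]
	fmt.Println(marks)              // [false false false false false]
}
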
+ if s.state != mSpanInUse || s.sweepgen != sweepgen-1 { + print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n") + gothrow("MSpan_Sweep: bad span state after sweep") + } + atomicstore(&s.sweepgen, sweepgen) + } + if nfree > 0 { + c.local_nsmallfree[cl] += uintptr(nfree) + c.local_cachealloc -= intptr(uintptr(nfree) * size) + xadd64(&memstats.next_gc, -int64(nfree)*int64(size)*int64(gcpercent+100)/100) + res = mCentral_FreeSpan(&mheap_.central[cl].mcentral, s, int32(nfree), head.next, end, preserve) + // MCentral_FreeSpan updates sweepgen + } + return res + } + + // State of background sweep. + // Protected by gclock. + type sweepdata struct { + g *g + parked bool + started bool + + spanidx uint32 // background sweeper position + + nbgsweep uint32 + npausesweep uint32 + } + + var sweep sweepdata + + // sweeps one span + // returns number of pages returned to heap, or ^uintptr(0) if there is nothing to sweep + func sweepone() uintptr { + _g_ := getg() + + // increment locks to ensure that the goroutine is not preempted + // in the middle of sweep thus leaving the span in an inconsistent state for next GC + _g_.m.locks++ + sg := mheap_.sweepgen + for { + idx := xadd(&sweep.spanidx, 1) - 1 + if idx >= uint32(len(work.spans)) { + mheap_.sweepdone = 1 + _g_.m.locks-- + return ^uintptr(0) + } + s := work.spans[idx] + if s.state != mSpanInUse { + s.sweepgen = sg + continue + } + if s.sweepgen != sg-2 || !cas(&s.sweepgen, sg-2, sg-1) { + continue + } + npages := s.npages + if !mSpan_Sweep(s, false) { + npages = 0 + } + _g_.m.locks-- + return npages + } + } + + func gosweepone() uintptr { + var ret uintptr + systemstack(func() { + ret = sweepone() + }) + return ret + } + + func gosweepdone() bool { + return mheap_.sweepdone != 0 + } + + func gchelper() { + _g_ := getg() + _g_.m.traceback = 2 + gchelperstart() + - // parallel mark for over gc roots ++ // parallel mark for over GC roots + parfordo(work.markfor) - - // help other threads scan secondary blocks - scanblock(0, 0, nil) ++ if gcphase != _GCscan { ++ scanblock(0, 0, nil) // blocks in getfull ++ } + + nproc := work.nproc // work.nproc can change right after we increment work.ndone + if xadd(&work.ndone, +1) == nproc-1 { + notewakeup(&work.alldone) + } + _g_.m.traceback = 0 + } + + func cachestats() { + for i := 0; ; i++ { + p := allp[i] + if p == nil { + break + } + c := p.mcache + if c == nil { + continue + } + purgecachedstats(c) + } + } + + func flushallmcaches() { + for i := 0; ; i++ { + p := allp[i] + if p == nil { + break + } + c := p.mcache + if c == nil { + continue + } + mCache_ReleaseAll(c) + stackcache_clear(c) + } + } + + func updatememstats(stats *gcstats) { + if stats != nil { + *stats = gcstats{} + } + for mp := allm; mp != nil; mp = mp.alllink { + if stats != nil { + src := (*[unsafe.Sizeof(gcstats{}) / 8]uint64)(unsafe.Pointer(&mp.gcstats)) + dst := (*[unsafe.Sizeof(gcstats{}) / 8]uint64)(unsafe.Pointer(stats)) + for i, v := range src { + dst[i] += v + } + mp.gcstats = gcstats{} + } + } + + memstats.mcache_inuse = uint64(mheap_.cachealloc.inuse) + memstats.mspan_inuse = uint64(mheap_.spanalloc.inuse) + memstats.sys = memstats.heap_sys + memstats.stacks_sys + memstats.mspan_sys + + memstats.mcache_sys + memstats.buckhash_sys + memstats.gc_sys + memstats.other_sys + + // Calculate memory allocator stats. + // During program execution we only count number of frees and amount of freed memory. 
+ // Current number of alive object in the heap and amount of alive heap memory + // are calculated by scanning all spans. + // Total number of mallocs is calculated as number of frees plus number of alive objects. + // Similarly, total amount of allocated memory is calculated as amount of freed memory + // plus amount of alive heap memory. + memstats.alloc = 0 + memstats.total_alloc = 0 + memstats.nmalloc = 0 + memstats.nfree = 0 + for i := 0; i < len(memstats.by_size); i++ { + memstats.by_size[i].nmalloc = 0 + memstats.by_size[i].nfree = 0 + } + + // Flush MCache's to MCentral. + systemstack(flushallmcaches) + + // Aggregate local stats. + cachestats() + + // Scan all spans and count number of alive objects. + lock(&mheap_.lock) + for i := uint32(0); i < mheap_.nspan; i++ { + s := h_allspans[i] + if s.state != mSpanInUse { + continue + } + if s.sizeclass == 0 { + memstats.nmalloc++ + memstats.alloc += uint64(s.elemsize) + } else { + memstats.nmalloc += uint64(s.ref) + memstats.by_size[s.sizeclass].nmalloc += uint64(s.ref) + memstats.alloc += uint64(s.ref) * uint64(s.elemsize) + } + } + unlock(&mheap_.lock) + + // Aggregate by size class. + smallfree := uint64(0) + memstats.nfree = mheap_.nlargefree + for i := 0; i < len(memstats.by_size); i++ { + memstats.nfree += mheap_.nsmallfree[i] + memstats.by_size[i].nfree = mheap_.nsmallfree[i] + memstats.by_size[i].nmalloc += mheap_.nsmallfree[i] + smallfree += uint64(mheap_.nsmallfree[i]) * uint64(class_to_size[i]) + } + memstats.nfree += memstats.tinyallocs + memstats.nmalloc += memstats.nfree + + // Calculate derived stats. + memstats.total_alloc = uint64(memstats.alloc) + uint64(mheap_.largefree) + smallfree + memstats.heap_alloc = memstats.alloc + memstats.heap_objects = memstats.nmalloc - memstats.nfree + } + + func gcinit() { + if unsafe.Sizeof(workbuf{}) != _WorkbufSize { + gothrow("runtime: size of Workbuf is suboptimal") + } + + work.markfor = parforalloc(_MaxGcproc) + gcpercent = readgogc() + gcdatamask = unrollglobgcprog((*byte)(unsafe.Pointer(&gcdata)), uintptr(unsafe.Pointer(&edata))-uintptr(unsafe.Pointer(&data))) + gcbssmask = unrollglobgcprog((*byte)(unsafe.Pointer(&gcbss)), uintptr(unsafe.Pointer(&ebss))-uintptr(unsafe.Pointer(&bss))) + } + ++// Called from malloc.go using onM, stopping and starting the world handled in caller. + func gc_m(start_time int64, eagersweep bool) { + _g_ := getg() + gp := _g_.m.curg + casgstatus(gp, _Grunning, _Gwaiting) + gp.waitreason = "garbage collection" + + gc(start_time, eagersweep) ++ casgstatus(gp, _Gwaiting, _Grunning) ++} ++ ++// Similar to clearcheckmarkbits but works on a single span. ++// It preforms two tasks. ++// 1. When used before the checkmark phase it converts BitsDead (00) to bitsScalar (01) ++// for nibbles with the BoundaryBit set. ++// 2. When used after the checkmark phase it converts BitsPointerMark (11) to BitsPointer 10 and ++// BitsScalarMark (00) to BitsScalar (01), thus clearing the checkmark mark encoding. ++// For the second case it is possible to restore the BitsDead pattern but since ++// clearmark is a debug tool performance has a lower priority than simplicity. ++// The span is MSpanInUse and the world is stopped. ++func clearcheckmarkbitsspan(s *mspan) { ++ if s.state != _MSpanInUse { ++ print("runtime:clearcheckmarkbitsspan: state=", s.state, "\n") ++ gothrow("clearcheckmarkbitsspan: bad span state") ++ } + - if nbadblock > 0 { - // Work out path from root to bad block. 
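
The accounting comment in updatememstats above amounts to a simple identity: the allocator only records frees while the program runs, so mallocs and total allocation are reconstructed from the frees plus whatever is currently live in the in-use spans. A small sketch of that arithmetic (liveStats and freeStats are illustrative types, not runtime structures):

package main

import "fmt"

// liveStats is what scanning the in-use spans yields; freeStats is what
// the allocator accumulated while running.
type liveStats struct{ objects, bytes uint64 }
type freeStats struct{ nfree, bytes uint64 }

// derive reconstructs the totals the way updatememstats does:
// mallocs = frees + live objects, total allocation = freed + live bytes.
func derive(live liveStats, freed freeStats) (nmalloc, totalAlloc uint64) {
	nmalloc = freed.nfree + live.objects
	totalAlloc = freed.bytes + live.bytes
	return
}

func main() {
	n, t := derive(liveStats{objects: 100, bytes: 4096}, freeStats{nfree: 40, bytes: 2048})
	fmt.Println(n, t) // 140 6144
}
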
- for { - gc(start_time, eagersweep) - if nbadblock >= int32(len(badblock)) { - gothrow("cannot find path to bad pointer") ++ arena_start := mheap_.arena_start ++ cl := s.sizeclass ++ size := s.elemsize ++ var n int32 ++ if cl == 0 { ++ n = 1 ++ } else { ++ // Chunk full of small blocks ++ npages := class_to_allocnpages[cl] ++ n = npages << _PageShift / int32(size) ++ } ++ ++ // MSpan_Sweep has similar code but instead of overloading and ++ // complicating that routine we do a simpler walk here. ++ // Sweep through n objects of given size starting at p. ++ // This thread owns the span now, so it can manipulate ++ // the block bitmap without atomic operations. ++ p := uintptr(s.start) << _PageShift ++ ++ // Find bits for the beginning of the span. ++ off := (p - arena_start) / ptrSize ++ bitp := (*byte)(unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1)) ++ step := size / (ptrSize * wordsPerBitmapByte) ++ ++ // The type bit values are: ++ // 00 - BitsDead, for us BitsScalarMarked ++ // 01 - BitsScalar ++ // 10 - BitsPointer ++ // 11 - unused, for us BitsPointerMarked ++ // ++ // When called to prepare for the checkmark phase (checkmark==1), ++ // we change BitsDead to BitsScalar, so that there are no BitsScalarMarked ++ // type bits anywhere. ++ // ++ // The checkmark phase marks by changing BitsScalar to BitsScalarMarked ++ // and BitsPointer to BitsPointerMarked. ++ // ++ // When called to clean up after the checkmark phase (checkmark==0), ++ // we unmark by changing BitsScalarMarked back to BitsScalar and ++ // BitsPointerMarked back to BitsPointer. ++ // ++ // There are two problems with the scheme as just described. ++ // First, the setup rewrites BitsDead to BitsScalar, but the type bits ++ // following a BitsDead are uninitialized and must not be used. ++ // Second, objects that are free are expected to have their type ++ // bits zeroed (BitsDead), so in the cleanup we need to restore ++ // any BitsDeads that were there originally. ++ // ++ // In a one-word object (8-byte allocation on 64-bit system), ++ // there is no difference between BitsScalar and BitsDead, because ++ // neither is a pointer and there are no more words in the object, ++ // so using BitsScalar during the checkmark is safe and mapping ++ // both back to BitsDead during cleanup is also safe. ++ // ++ // In a larger object, we need to be more careful. During setup, ++ // if the type of the first word is BitsDead, we change it to BitsScalar ++ // (as we must) but also initialize the type of the second ++ // word to BitsDead, so that a scan during the checkmark phase ++ // will still stop before seeing the uninitialized type bits in the ++ // rest of the object. The sequence 'BitsScalar BitsDead' never ++ // happens in real type bitmaps - BitsDead is always as early ++ // as possible, so immediately after the last BitsPointer. ++ // During cleanup, if we see a BitsScalar, we can check to see if it ++ // is followed by BitsDead. If so, it was originally BitsDead and ++ // we can change it back. 
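
The encoding juggling described in the comment above is easier to see with the four 2-bit values written out. The XOR constant is not visible in this hunk; the value 1 below is inferred from the transitions the comment lists (Scalar 01 <-> ScalarMarked 00 and Pointer 10 <-> PointerMarked 11 are one low-bit flip apart), so treat it and the names as illustrative:

package main

import "fmt"

// Two-bit type encodings from the comment above, plus the checkmark
// variants they map to.
const (
	bitsDead          = 0 // doubles as BitsScalarMarked during checkmark
	bitsScalar        = 1
	bitsPointer       = 2
	bitsPointerMarked = 3 // the otherwise-unused 11 pattern

	bitsCheckMarkXor = 1 // flipping the low type bit toggles marked/unmarked
)

// checkmark marks a nibble's type bits: Scalar(01)->ScalarMarked(00),
// Pointer(10)->PointerMarked(11).
func checkmark(bits uint8) uint8 { return bits ^ bitsCheckMarkXor }

func main() {
	fmt.Println(checkmark(bitsScalar) == bitsDead)              // true
	fmt.Println(checkmark(bitsPointer) == bitsPointerMarked)    // true
	fmt.Println(checkmark(checkmark(bitsScalar)) == bitsScalar) // XOR is its own inverse
}
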
++ ++ if step == 0 { ++ // updating top and bottom nibbles, all boundaries ++ for i := int32(0); i < n/2; i, bitp = i+1, addb(bitp, uintptrMask&-1) { ++ if *bitp&bitBoundary == 0 { ++ gothrow("missing bitBoundary") ++ } ++ b := (*bitp & bitPtrMask) >> 2 ++ if !checkmark && (b == _BitsScalar || b == _BitsScalarMarked) { ++ *bitp &^= 0x0c // convert to _BitsDead ++ } else if b == _BitsScalarMarked || b == _BitsPointerMarked { ++ *bitp &^= _BitsCheckMarkXor << 2 ++ } ++ ++ if (*bitp>>gcBits)&bitBoundary == 0 { ++ gothrow("missing bitBoundary") ++ } ++ b = ((*bitp >> gcBits) & bitPtrMask) >> 2 ++ if !checkmark && (b == _BitsScalar || b == _BitsScalarMarked) { ++ *bitp &^= 0xc0 // convert to _BitsDead ++ } else if b == _BitsScalarMarked || b == _BitsPointerMarked { ++ *bitp &^= _BitsCheckMarkXor << (2 + gcBits) ++ } ++ } ++ } else { ++ // updating bottom nibble for first word of each object ++ for i := int32(0); i < n; i, bitp = i+1, addb(bitp, -step) { ++ if *bitp&bitBoundary == 0 { ++ gothrow("missing bitBoundary") ++ } ++ b := (*bitp & bitPtrMask) >> 2 ++ ++ if checkmark && b == _BitsDead { ++ // move BitsDead into second word. ++ // set bits to BitsScalar in preparation for checkmark phase. ++ *bitp &^= 0xc0 ++ *bitp |= _BitsScalar << 2 ++ } else if !checkmark && (b == _BitsScalar || b == _BitsScalarMarked) && *bitp&0xc0 == 0 { ++ // Cleaning up after checkmark phase. ++ // First word is scalar or dead (we forgot) ++ // and second word is dead. ++ // First word might as well be dead too. ++ *bitp &^= 0x0c ++ } else if b == _BitsScalarMarked || b == _BitsPointerMarked { ++ *bitp ^= _BitsCheckMarkXor << 2 + } + } + } ++} + - casgstatus(gp, _Gwaiting, _Grunning) ++// clearcheckmarkbits preforms two tasks. ++// 1. When used before the checkmark phase it converts BitsDead (00) to bitsScalar (01) ++// for nibbles with the BoundaryBit set. ++// 2. When used after the checkmark phase it converts BitsPointerMark (11) to BitsPointer 10 and ++// BitsScalarMark (00) to BitsScalar (01), thus clearing the checkmark mark encoding. ++// This is a bit expensive but preserves the BitsDead encoding during the normal marking. ++// BitsDead remains valid for every nibble except the ones with BitsBoundary set. ++func clearcheckmarkbits() { ++ for _, s := range work.spans { ++ if s.state == _MSpanInUse { ++ clearcheckmarkbitsspan(s) ++ } ++ } ++} ++ ++// Called from malloc.go using onM. ++// The world is stopped. Rerun the scan and mark phases ++// using the bitMarkedCheck bit instead of the ++// bitMarked bit. If the marking encounters an ++// bitMarked bit that is not set then we throw. ++func gccheckmark_m(startTime int64, eagersweep bool) { ++ if !gccheckmarkenable { ++ return ++ } ++ ++ if checkmark { ++ gothrow("gccheckmark_m, entered with checkmark already true") ++ } ++ ++ checkmark = true ++ clearcheckmarkbits() // Converts BitsDead to BitsScalar. ++ gc_m(startTime, eagersweep) // turns off checkmark ++ // Work done, fixed up the GC bitmap to remove the checkmark bits. ++ clearcheckmarkbits() ++} ++ ++func gccheckmarkenable_m() { ++ gccheckmarkenable = true ++} ++ ++func gccheckmarkdisable_m() { ++ gccheckmarkenable = false ++} ++ ++func finishsweep_m() { ++ // The world is stopped so we should be able to complete the sweeps ++ // quickly. ++ for sweepone() != ^uintptr(0) { ++ sweep.npausesweep++ ++ } ++ ++ // There may be some other spans being swept concurrently that ++ // we need to wait for. If finishsweep_m is done with the world stopped ++ // this code is not required. 
++ sg := mheap_.sweepgen ++ for _, s := range work.spans { ++ if s.sweepgen != sg && s.state == _MSpanInUse { ++ mSpan_EnsureSwept(s) ++ } ++ } ++} ++ ++// Scan all of the stacks, greying (or graying if in America) the referents ++// but not blackening them since the mark write barrier isn't installed. ++func gcscan_m() { ++ _g_ := getg() ++ ++ // Grab the g that called us and potentially allow rescheduling. ++ // This allows it to be scanned like other goroutines. ++ mastergp := _g_.m.curg ++ casgstatus(mastergp, _Grunning, _Gwaiting) ++ mastergp.waitreason = "garbage collection scan" ++ ++ // Span sweeping has been done by finishsweep_m. ++ // Long term we will want to make this goroutine runnable ++ // by placing it onto a scanenqueue state and then calling ++ // runtime·restartg(mastergp) to make it Grunnable. ++ // At the bottom we will want to return this p back to the scheduler. ++ oldphase := gcphase ++ ++ // Prepare flag indicating that the scan has not been completed. ++ lock(&allglock) ++ local_allglen := allglen ++ for i := uintptr(0); i < local_allglen; i++ { ++ gp := allgs[i] ++ gp.gcworkdone = false // set to true in gcphasework ++ } ++ unlock(&allglock) ++ ++ work.nwait = 0 ++ work.ndone = 0 ++ work.nproc = 1 // For now do not do this in parallel. ++ gcphase = _GCscan ++ // ackgcphase is not needed since we are not scanning running goroutines. ++ parforsetup(work.markfor, work.nproc, uint32(_RootCount+local_allglen), nil, false, markroot) ++ parfordo(work.markfor) ++ ++ lock(&allglock) ++ // Check that gc work is done. ++ for i := uintptr(0); i < local_allglen; i++ { ++ gp := allgs[i] ++ if !gp.gcworkdone { ++ gothrow("scan missed a g") ++ } ++ } ++ unlock(&allglock) ++ ++ gcphase = oldphase ++ casgstatus(mastergp, _Gwaiting, _Grunning) ++ // Let the g that called us continue to run. ++} ++ ++// Mark all objects that are known about. ++func gcmark_m() { ++ scanblock(0, 0, nil) ++} ++ ++// For now this must be bracketed with a stoptheworld and a starttheworld to ensure ++// all go routines see the new barrier. ++func gcinstallmarkwb_m() { ++ gcphase = _GCmark ++} ++ ++// For now this must be bracketed with a stoptheworld and a starttheworld to ensure ++// all go routines see the new barrier. ++func gcinstalloffwb_m() { ++ gcphase = _GCoff + } + + func gc(start_time int64, eagersweep bool) { + if _DebugGCPtrs { + print("GC start\n") + } + + if debug.allocfreetrace > 0 { + tracegc() + } + + _g_ := getg() + _g_.m.traceback = 2 + t0 := start_time + work.tstart = start_time + + var t1 int64 + if debug.gctrace > 0 { + t1 = nanotime() + } + - // Sweep what is not sweeped by bgsweep. - for sweepone() != ^uintptr(0) { - sweep.npausesweep++ ++ if !checkmark { ++ finishsweep_m() // skip during checkmark debug phase. + } + + // Cache runtime.mheap_.allspans in work.spans to avoid conflicts with + // resizing/freeing allspans. + // New spans can be created while GC progresses, but they are not garbage for + // this round: + // - new stack spans can be created even while the world is stopped. + // - new malloc spans can be created during the concurrent sweep + + // Even if this is stop-the-world, a concurrent exitsyscall can allocate a stack from heap. + lock(&mheap_.lock) + // Free the old cached sweep array if necessary. + if work.spans != nil && &work.spans[0] != &h_allspans[0] { + sysFree(unsafe.Pointer(&work.spans[0]), uintptr(len(work.spans))*unsafe.Sizeof(work.spans[0]), &memstats.other_sys) + } + // Cache the current array for marking. 
+ mheap_.gcspans = mheap_.allspans + work.spans = h_allspans + unlock(&mheap_.lock) ++ oldphase := gcphase + + work.nwait = 0 + work.ndone = 0 + work.nproc = uint32(gcprocs()) ++ gcphase = _GCmarktermination ++ ++ // World is stopped so allglen will not change. ++ for i := uintptr(0); i < allglen; i++ { ++ gp := allgs[i] ++ gp.gcworkdone = false // set to true in gcphasework ++ } ++ + parforsetup(work.markfor, work.nproc, uint32(_RootCount+allglen), nil, false, markroot) + if work.nproc > 1 { + noteclear(&work.alldone) + helpgc(int32(work.nproc)) + } + + var t2 int64 + if debug.gctrace > 0 { + t2 = nanotime() + } + + gchelperstart() + parfordo(work.markfor) + scanblock(0, 0, nil) + ++ if work.full != 0 { ++ gothrow("work.full != 0") ++ } ++ if work.partial != 0 { ++ gothrow("work.partial != 0") ++ } ++ ++ gcphase = oldphase + var t3 int64 + if debug.gctrace > 0 { + t3 = nanotime() + } + + if work.nproc > 1 { + notesleep(&work.alldone) + } + + shrinkfinish() + + cachestats() + // next_gc calculation is tricky with concurrent sweep since we don't know size of live heap + // estimate what was live heap size after previous GC (for printing only) + heap0 := memstats.next_gc * 100 / (uint64(gcpercent) + 100) + // conservatively set next_gc to high value assuming that everything is live + // concurrent/lazy sweep will reduce this number while discovering new garbage + memstats.next_gc = memstats.heap_alloc + memstats.heap_alloc*uint64(gcpercent)/100 + + t4 := nanotime() + atomicstore64(&memstats.last_gc, uint64(unixnanotime())) // must be Unix time to make sense to user + memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(t4 - t0) + memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(t4) + memstats.pause_total_ns += uint64(t4 - t0) + memstats.numgc++ + if memstats.debuggc { + print("pause ", t4-t0, "\n") + } + + if debug.gctrace > 0 { + heap1 := memstats.heap_alloc + var stats gcstats + updatememstats(&stats) + if heap1 != memstats.heap_alloc { + print("runtime: mstats skew: heap=", heap1, "/", memstats.heap_alloc, "\n") + gothrow("mstats skew") + } + obj := memstats.nmalloc - memstats.nfree + + stats.nprocyield += work.markfor.nprocyield + stats.nosyield += work.markfor.nosyield + stats.nsleep += work.markfor.nsleep + + print("gc", memstats.numgc, "(", work.nproc, "): ", + (t1-t0)/1000, "+", (t2-t1)/1000, "+", (t3-t2)/1000, "+", (t4-t3)/1000, " us, ", + heap0>>20, " -> ", heap1>>20, " MB, ", + obj, " (", memstats.nmalloc, "-", memstats.nfree, ") objects, ", + gcount(), " goroutines, ", + len(work.spans), "/", sweep.nbgsweep, "/", sweep.npausesweep, " sweeps, ", + stats.nhandoff, "(", stats.nhandoffcnt, ") handoff, ", + work.markfor.nsteal, "(", work.markfor.nstealcnt, ") steal, ", + stats.nprocyield, "/", stats.nosyield, "/", stats.nsleep, " yields\n") + sweep.nbgsweep = 0 + sweep.npausesweep = 0 + } + + // See the comment in the beginning of this function as to why we need the following. + // Even if this is still stop-the-world, a concurrent exitsyscall can allocate a stack from heap. + lock(&mheap_.lock) + // Free the old cached mark array if necessary. 
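
The next_gc computation in gc() above is the plain GOGC rule, and the heap0 value on the gctrace line simply inverts it to estimate the live heap after the previous cycle. For reference (gcpercent comes from GOGC; the numbers are only an example):

package main

import "fmt"

// nextGC mirrors memstats.next_gc = heap_alloc + heap_alloc*gcpercent/100.
func nextGC(heapAlloc, gcpercent uint64) uint64 {
	return heapAlloc + heapAlloc*gcpercent/100
}

// prevLive mirrors heap0 = next_gc*100/(gcpercent+100), the estimate of
// the live heap after the previous cycle that gctrace prints.
func prevLive(target, gcpercent uint64) uint64 {
	return target * 100 / (gcpercent + 100)
}

func main() {
	ng := nextGC(4<<20, 100)           // with GOGC=100, a 4 MB heap targets 8 MB
	fmt.Println(ng, prevLive(ng, 100)) // 8388608 4194304
}
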
+ if work.spans != nil && &work.spans[0] != &h_allspans[0] { + sysFree(unsafe.Pointer(&work.spans[0]), uintptr(len(work.spans))*unsafe.Sizeof(work.spans[0]), &memstats.other_sys) + } + ++ if gccheckmarkenable { ++ if !checkmark { ++ // first half of two-pass; don't set up sweep ++ unlock(&mheap_.lock) ++ return ++ } ++ checkmark = false // done checking marks ++ } ++ + // Cache the current array for sweeping. + mheap_.gcspans = mheap_.allspans + mheap_.sweepgen += 2 + mheap_.sweepdone = 0 + work.spans = h_allspans + sweep.spanidx = 0 + unlock(&mheap_.lock) + + if _ConcurrentSweep && !eagersweep { + lock(&gclock) + if !sweep.started { + go bgsweep() + sweep.started = true + } else if sweep.parked { + sweep.parked = false + ready(sweep.g) + } + unlock(&gclock) + } else { + // Sweep all spans eagerly. + for sweepone() != ^uintptr(0) { + sweep.npausesweep++ + } + // Do an additional mProf_GC, because all 'free' events are now real as well. + mProf_GC() + } + + mProf_GC() + _g_.m.traceback = 0 + + if _DebugGCPtrs { + print("GC end\n") + } + } + + func readmemstats_m(stats *MemStats) { + updatememstats(nil) + + // Size of the trailing by_size array differs between Go and C, + // NumSizeClasses was changed, but we can not change Go struct because of backward compatibility. + memmove(unsafe.Pointer(stats), unsafe.Pointer(&memstats), sizeof_C_MStats) + + // Stack numbers are part of the heap numbers, separate those out for user consumption + stats.StackSys = stats.StackInuse + stats.HeapInuse -= stats.StackInuse + stats.HeapSys -= stats.StackInuse + } + + //go:linkname readGCStats runtime/debug.readGCStats + func readGCStats(pauses *[]uint64) { + systemstack(func() { + readGCStats_m(pauses) + }) + } + + func readGCStats_m(pauses *[]uint64) { + p := *pauses + // Calling code in runtime/debug should make the slice large enough. + if cap(p) < len(memstats.pause_ns)+3 { + gothrow("runtime: short slice passed to readGCStats") + } + + // Pass back: pauses, pause ends, last gc (absolute time), number of gc, total pause ns. + lock(&mheap_.lock) + + n := memstats.numgc + if n > uint32(len(memstats.pause_ns)) { + n = uint32(len(memstats.pause_ns)) + } + + // The pause buffer is circular. The most recent pause is at + // pause_ns[(numgc-1)%len(pause_ns)], and then backward + // from there to go back farther in time. We deliver the times + // most recent first (in p[0]). + p = p[:cap(p)] + for i := uint32(0); i < n; i++ { + j := (memstats.numgc - 1 - i) % uint32(len(memstats.pause_ns)) + p[i] = memstats.pause_ns[j] + p[n+i] = memstats.pause_end[j] + } + + p[n+n] = memstats.last_gc + p[n+n+1] = uint64(memstats.numgc) + p[n+n+2] = memstats.pause_total_ns + unlock(&mheap_.lock) + *pauses = p[:n+n+3] + } + + func setGCPercent(in int32) (out int32) { + lock(&mheap_.lock) + out = gcpercent + if in < 0 { + in = -1 + } + gcpercent = in + unlock(&mheap_.lock) + return out + } + + func gchelperstart() { + _g_ := getg() + + if _g_.m.helpgc < 0 || _g_.m.helpgc >= _MaxGcproc { + gothrow("gchelperstart: bad m->helpgc") + } + if _g_ != _g_.m.g0 { + gothrow("gchelper not running on g0 stack") + } + } + + func wakefing() *g { + var res *g + lock(&finlock) + if fingwait && fingwake { + fingwait = false + fingwake = false + res = fing + } + unlock(&finlock) + return res + } + + func addb(p *byte, n uintptr) *byte { + return (*byte)(add(unsafe.Pointer(p), n)) + } + + // Recursively unrolls GC program in prog. + // mask is where to store the result. + // ppos is a pointer to position in mask, in bits. 
+ // sparse says to generate 4-bits per word mask for heap (2-bits for data/bss otherwise). + func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace, sparse bool) *byte { + arena_start := mheap_.arena_start + pos := *ppos + mask := (*[1 << 30]byte)(unsafe.Pointer(maskp)) + for { + switch *prog { + default: + gothrow("unrollgcprog: unknown instruction") + + case insData: + prog = addb(prog, 1) + siz := int(*prog) + prog = addb(prog, 1) + p := (*[1 << 30]byte)(unsafe.Pointer(prog)) + for i := 0; i < siz; i++ { + v := p[i/_PointersPerByte] + v >>= (uint(i) % _PointersPerByte) * _BitsPerPointer + v &= _BitsMask + if inplace { + // Store directly into GC bitmap. + off := (uintptr(unsafe.Pointer(&mask[pos])) - arena_start) / ptrSize + bitp := (*byte)(unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1)) + shift := (off % wordsPerBitmapByte) * gcBits + if shift == 0 { + *bitp = 0 + } + *bitp |= v << (shift + 2) + pos += ptrSize + } else if sparse { + // 4-bits per word + v <<= (pos % 8) + 2 + mask[pos/8] |= v + pos += gcBits + } else { + // 2-bits per word + v <<= pos % 8 + mask[pos/8] |= v + pos += _BitsPerPointer + } + } + prog = addb(prog, round(uintptr(siz)*_BitsPerPointer, 8)/8) + + case insArray: + prog = (*byte)(add(unsafe.Pointer(prog), 1)) + siz := uintptr(0) + for i := uintptr(0); i < ptrSize; i++ { + siz = (siz << 8) + uintptr(*(*byte)(add(unsafe.Pointer(prog), ptrSize-i-1))) + } + prog = (*byte)(add(unsafe.Pointer(prog), ptrSize)) + var prog1 *byte + for i := uintptr(0); i < siz; i++ { + prog1 = unrollgcprog1(&mask[0], prog, &pos, inplace, sparse) + } + if *prog1 != insArrayEnd { + gothrow("unrollgcprog: array does not end with insArrayEnd") + } + prog = (*byte)(add(unsafe.Pointer(prog1), 1)) + + case insArrayEnd, insEnd: + *ppos = pos + return prog + } + } + } + + // Unrolls GC program prog for data/bss, returns dense GC mask. + func unrollglobgcprog(prog *byte, size uintptr) bitvector { + masksize := round(round(size, ptrSize)/ptrSize*bitsPerPointer, 8) / 8 + mask := (*[1 << 30]byte)(persistentalloc(masksize+1, 0, &memstats.gc_sys)) + mask[masksize] = 0xa1 + pos := uintptr(0) + prog = unrollgcprog1(&mask[0], prog, &pos, false, false) + if pos != size/ptrSize*bitsPerPointer { + print("unrollglobgcprog: bad program size, got ", pos, ", expect ", size/ptrSize*bitsPerPointer, "\n") + gothrow("unrollglobgcprog: bad program size") + } + if *prog != insEnd { + gothrow("unrollglobgcprog: program does not end with insEnd") + } + if mask[masksize] != 0xa1 { + gothrow("unrollglobgcprog: overflow") + } + return bitvector{int32(masksize * 8), &mask[0]} + } + + func unrollgcproginplace_m(v unsafe.Pointer, typ *_type, size, size0 uintptr) { + pos := uintptr(0) + prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1]))) + for pos != size0 { + unrollgcprog1((*byte)(v), prog, &pos, true, true) + } + + // Mark first word as bitAllocated. + arena_start := mheap_.arena_start + off := (uintptr(v) - arena_start) / ptrSize + bitp := (*byte)(unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1)) + shift := (off % wordsPerBitmapByte) * gcBits + *bitp |= bitBoundary << shift + + // Mark word after last as BitsDead. 
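
unrollgcprog1 above is an interpreter for the compact GC "program" encoding: insData emits a run of 2-bit type values, insArray repeats a block a given number of times. A much-simplified standalone version (one byte per word in the output, one-byte array lengths, no in-place or sparse modes) to show the control flow:

package main

import "fmt"

// Simplified instruction set; the real encoding packs the mask 2 (or 4)
// bits per word and uses pointer-sized array lengths.
const (
	insData     = 1
	insArray    = 2
	insArrayEnd = 3
	insEnd      = 4
)

// unroll appends the per-word type values described by prog[pc:] to mask
// and returns the new position, stopping at insArrayEnd or insEnd.
// Array counts are assumed to be at least 1.
func unroll(mask, prog []byte, pc int) ([]byte, int) {
	for {
		switch prog[pc] {
		case insData:
			n := int(prog[pc+1])
			mask = append(mask, prog[pc+2:pc+2+n]...)
			pc += 2 + n
		case insArray:
			count := int(prog[pc+1])
			start := pc + 2
			end := start
			for i := 0; i < count; i++ {
				mask, end = unroll(mask, prog, start)
			}
			if prog[end] != insArrayEnd {
				panic("array does not end with insArrayEnd")
			}
			pc = end + 1
		case insArrayEnd, insEnd:
			return mask, pc
		default:
			panic("unknown instruction")
		}
	}
}

func main() {
	// Roughly: struct { ptr; [3]struct{ scalar; ptr } }
	prog := []byte{
		insData, 1, 2, // one BitsPointer word
		insArray, 3, insData, 2, 1, 2, insArrayEnd, // 3 x (BitsScalar, BitsPointer)
		insEnd,
	}
	mask, _ := unroll(nil, prog, 0)
	fmt.Println(mask) // [2 1 2 1 2 1 2]
}
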
+ if size0 < size { + off := (uintptr(v) + size0 - arena_start) / ptrSize + bitp := (*byte)(unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1)) + shift := (off % wordsPerBitmapByte) * gcBits + *bitp &= uint8(^(bitPtrMask << shift) | uintptr(bitsDead)<<(shift+2)) + } + } + + var unroll mutex + + // Unrolls GC program in typ.gc[1] into typ.gc[0] + func unrollgcprog_m(typ *_type) { + lock(&unroll) + mask := (*byte)(unsafe.Pointer(uintptr(typ.gc[0]))) + if *mask == 0 { + pos := uintptr(8) // skip the unroll flag + prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1]))) + prog = unrollgcprog1(mask, prog, &pos, false, true) + if *prog != insEnd { + gothrow("unrollgcprog: program does not end with insEnd") + } + if typ.size/ptrSize%2 != 0 { + // repeat the program + prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1]))) + unrollgcprog1(mask, prog, &pos, false, true) + } + + // atomic way to say mask[0] = 1 + atomicor8(mask, 1) + } + unlock(&unroll) + } + + // mark the span of memory at v as having n blocks of the given size. + // if leftover is true, there is left over space at the end of the span. + func markspan(v unsafe.Pointer, size uintptr, n uintptr, leftover bool) { + if uintptr(v)+size*n > mheap_.arena_used || uintptr(v) < mheap_.arena_start { + gothrow("markspan: bad pointer") + } + + // Find bits of the beginning of the span. + off := (uintptr(v) - uintptr(mheap_.arena_start)) / ptrSize + if off%wordsPerBitmapByte != 0 { + gothrow("markspan: unaligned length") + } + b := mheap_.arena_start - off/wordsPerBitmapByte - 1 + + // Okay to use non-atomic ops here, because we control + // the entire span, and each bitmap byte has bits for only + // one span, so no other goroutines are changing these bitmap words. + + if size == ptrSize { + // Possible only on 64-bits (minimal size class is 8 bytes). + // Set memory to 0x11. + if (bitBoundary|bitsDead)< mheap_.arena_used || v < mheap_.arena_start { + gothrow("markspan: bad pointer") + } + + off := (v - mheap_.arena_start) / ptrSize // word offset + if off%(ptrSize*wordsPerBitmapByte) != 0 { + gothrow("markspan: unaligned pointer") + } + + b := mheap_.arena_start - off/wordsPerBitmapByte - 1 + n /= ptrSize + if n%(ptrSize*wordsPerBitmapByte) != 0 { + gothrow("unmarkspan: unaligned length") + } + + // Okay to use non-atomic ops here, because we control + // the entire span, and each bitmap word has bits for only + // one span, so no other goroutines are changing these + // bitmap words. + n /= wordsPerBitmapByte + memclr(unsafe.Pointer(b-n+1), n) + } + + func mHeap_MapBits(h *mheap) { + // Caller has added extra mappings to the arena. + // Add extra mappings of bitmap words as needed. + // We allocate extra bitmap pieces in chunks of bitmapChunk. + const bitmapChunk = 8192 + + n := (h.arena_used - h.arena_start) / (ptrSize * wordsPerBitmapByte) + n = round(n, bitmapChunk) + n = round(n, _PhysPageSize) + if h.bitmap_mapped >= n { + return + } + + sysMap(unsafe.Pointer(h.arena_start-n), n-h.bitmap_mapped, h.arena_reserved, &memstats.gc_sys) + h.bitmap_mapped = n + } + + func getgcmaskcb(frame *stkframe, ctxt unsafe.Pointer) bool { + target := (*stkframe)(ctxt) + if frame.sp <= target.sp && target.sp < frame.varp { + *target = *frame + return false + } + return true + } + + // Returns GC type info for object p for testing. 
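
markspan, unmarkspan and the sweep code all share the same bitmap addressing: 4 bits (gcBits) per heap word, two words per bitmap byte, with the bitmap growing downward from arena_start. A small sketch of just that arithmetic (arenaStart and the addresses are made-up values):

package main

import "fmt"

const (
	ptrSize            = 8
	gcBits             = 4
	wordsPerBitmapByte = 8 / gcBits // 2
)

// bitmapAddr returns the bitmap byte address and in-byte shift that
// describe the heap word at address p, following the off/bitp/shift
// computation used above.
func bitmapAddr(arenaStart, p uintptr) (bitp uintptr, shift uint) {
	off := (p - arenaStart) / ptrSize // word index into the arena
	bitp = arenaStart - off/wordsPerBitmapByte - 1
	shift = uint(off%wordsPerBitmapByte) * gcBits
	return
}

func main() {
	const arenaStart = 0x1000000
	for _, p := range []uintptr{arenaStart, arenaStart + 8, arenaStart + 16} {
		bitp, shift := bitmapAddr(arenaStart, p)
		fmt.Printf("word %#x -> byte %#x shift %d\n", p, bitp, shift)
	}
	// word 0x1000000 -> byte 0xffffff shift 0
	// word 0x1000008 -> byte 0xffffff shift 4
	// word 0x1000010 -> byte 0xfffffe shift 0
}
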
+ func getgcmask(p unsafe.Pointer, t *_type, mask **byte, len *uintptr) { + *mask = nil + *len = 0 + + // data + if uintptr(unsafe.Pointer(&data)) <= uintptr(p) && uintptr(p) < uintptr(unsafe.Pointer(&edata)) { + n := (*ptrtype)(unsafe.Pointer(t)).elem.size + *len = n / ptrSize + *mask = &make([]byte, *len)[0] + for i := uintptr(0); i < n; i += ptrSize { + off := (uintptr(p) + i - uintptr(unsafe.Pointer(&data))) / ptrSize + bits := (*(*byte)(add(unsafe.Pointer(gcdatamask.bytedata), off/pointersPerByte)) >> ((off % pointersPerByte) * bitsPerPointer)) & bitsMask + *(*byte)(add(unsafe.Pointer(*mask), i/ptrSize)) = bits + } + return + } + + // bss + if uintptr(unsafe.Pointer(&bss)) <= uintptr(p) && uintptr(p) < uintptr(unsafe.Pointer(&ebss)) { + n := (*ptrtype)(unsafe.Pointer(t)).elem.size + *len = n / ptrSize + *mask = &make([]byte, *len)[0] + for i := uintptr(0); i < n; i += ptrSize { + off := (uintptr(p) + i - uintptr(unsafe.Pointer(&bss))) / ptrSize + bits := (*(*byte)(add(unsafe.Pointer(gcbssmask.bytedata), off/pointersPerByte)) >> ((off % pointersPerByte) * bitsPerPointer)) & bitsMask + *(*byte)(add(unsafe.Pointer(*mask), i/ptrSize)) = bits + } + return + } + + // heap + var n uintptr + var base uintptr + if mlookup(uintptr(p), &base, &n, nil) != 0 { + *len = n / ptrSize + *mask = &make([]byte, *len)[0] + for i := uintptr(0); i < n; i += ptrSize { + off := (uintptr(base) + i - mheap_.arena_start) / ptrSize + b := mheap_.arena_start - off/wordsPerBitmapByte - 1 + shift := (off % wordsPerBitmapByte) * gcBits + bits := (*(*byte)(unsafe.Pointer(b)) >> (shift + 2)) & bitsMask + *(*byte)(add(unsafe.Pointer(*mask), i/ptrSize)) = bits + } + return + } + + // stack + var frame stkframe + frame.sp = uintptr(p) + _g_ := getg() + gentraceback(_g_.m.curg.sched.pc, _g_.m.curg.sched.sp, 0, _g_.m.curg, 0, nil, 1000, getgcmaskcb, noescape(unsafe.Pointer(&frame)), 0) + if frame.fn != nil { + f := frame.fn + targetpc := frame.continpc + if targetpc == 0 { + return + } + if targetpc != f.entry { + targetpc-- + } + pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc) + if pcdata == -1 { + return + } + stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps)) + if stkmap == nil || stkmap.n <= 0 { + return + } + bv := stackmapdata(stkmap, pcdata) + size := uintptr(bv.n) / bitsPerPointer * ptrSize + n := (*ptrtype)(unsafe.Pointer(t)).elem.size + *len = n / ptrSize + *mask = &make([]byte, *len)[0] + for i := uintptr(0); i < n; i += ptrSize { + off := (uintptr(p) + i - frame.varp + size) / ptrSize + bits := ((*(*byte)(add(unsafe.Pointer(bv.bytedata), off*bitsPerPointer/8))) >> ((off * bitsPerPointer) % 8)) & bitsMask + *(*byte)(add(unsafe.Pointer(*mask), i/ptrSize)) = bits + } + } + } + + func unixnanotime() int64 { + var now int64 + gc_unixnanotime(&now) + return now + } diff --cc src/runtime/mgc0.go index dc4eec5196,6d4ae61c11..00e64c0fff --- a/src/runtime/mgc0.go +++ b/src/runtime/mgc0.go @@@ -93,36 -86,6 +91,32 @@@ const //go:nosplit func writebarrierptr(dst *uintptr, src uintptr) { *dst = src + writebarrierptr_nostore(dst, src) +} + +// Like writebarrierptr, but the store has already been applied. +// Do not reapply. 
+//go:nosplit +func writebarrierptr_nostore(dst *uintptr, src uintptr) { + if getg() == nil { // very low-level startup + return + } + + if src != 0 && (src < _PageSize || src == _PoisonGC || src == _PoisonStack) { - onM(func() { gothrow("bad pointer in write barrier") }) ++ systemstack(func() { gothrow("bad pointer in write barrier") }) + } + + mp := acquirem() + if mp.inwb || mp.dying > 0 { + releasem(mp) + return + } + mp.inwb = true - oldscalar0 := mp.scalararg[0] - oldscalar1 := mp.scalararg[1] - mp.scalararg[0] = uintptr(unsafe.Pointer(dst)) - mp.scalararg[1] = src - onM_signalok(gcmarkwb_m) - mp.scalararg[0] = oldscalar0 - mp.scalararg[1] = oldscalar1 ++ systemstack(func() { ++ gcmarkwb_m(dst, src) ++ }) + mp.inwb = false + releasem(mp) } //go:nosplit diff --cc src/runtime/mgc0.h index 519d7206e7,62726b4f0f..dd0c460246 --- a/src/runtime/mgc0.h +++ b/src/runtime/mgc0.h @@@ -2,81 -2,19 +2,21 @@@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. - // Garbage collector (GC) + // Used by cmd/gc. enum { - // Four bits per word (see #defines below). gcBits = 4, - wordsPerBitmapByte = 8/gcBits, - - // GC type info programs. - // The programs allow to store type info required for GC in a compact form. - // Most importantly arrays take O(1) space instead of O(n). - // The program grammar is: - // - // Program = {Block} "insEnd" - // Block = Data | Array - // Data = "insData" DataSize DataBlock - // DataSize = int // size of the DataBlock in bit pairs, 1 byte - // DataBlock = binary // dense GC mask (2 bits per word) of size ]DataSize/4[ bytes - // Array = "insArray" ArrayLen Block "insArrayEnd" - // ArrayLen = int // length of the array, 8 bytes (4 bytes for 32-bit arch) - // - // Each instruction (insData, insArray, etc) is 1 byte. - // For example, for type struct { x []byte; y [20]struct{ z int; w *byte }; } - // the program looks as: - // - // insData 3 (BitsMultiWord BitsSlice BitsScalar) - // insArray 20 insData 2 (BitsScalar BitsPointer) insArrayEnd insEnd - // - // Total size of the program is 17 bytes (13 bytes on 32-bits). - // The corresponding GC mask would take 43 bytes (it would be repeated - // because the type has odd number of words). + BitsPerPointer = 2, + BitsDead = 0, + BitsScalar = 1, + BitsPointer = 2, + BitsMask = 3, + PointersPerByte = 8/BitsPerPointer, - MaxGCMask = 64, insData = 1, insArray, insArrayEnd, insEnd, + - // Pointer map - BitsPerPointer = 2, - BitsMask = (1< + const ( + _CTL_HW = 6 + _HW_NCPU = 3 + ) + + var sigset_none = sigset{} + var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}} + + func getncpu() int32 { + mib := [2]uint32{_CTL_HW, _HW_NCPU} + out := uint32(0) + nout := unsafe.Sizeof(out) + ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0) + if ret >= 0 { + return int32(out) + } + return 1 + } + + // FreeBSD's umtx_op syscall is effectively the same as Linux's futex, and + // thus the code is largely similar. See Linux implementation + // and lock_futex.c for comments. 
+ + //go:nosplit + func futexsleep(addr *uint32, val uint32, ns int64) { + systemstack(func() { + futexsleep1(addr, val, ns) + }) + } + + func futexsleep1(addr *uint32, val uint32, ns int64) { + var tsp *timespec + if ns >= 0 { + var ts timespec + ts.tv_nsec = 0 - ts.set_sec(timediv(ns, 1000000000, (*int32)(unsafe.Pointer(&ts.tv_nsec)))) ++ ts.set_sec(int64(timediv(ns, 1000000000, (*int32)(unsafe.Pointer(&ts.tv_nsec))))) + tsp = &ts + } + ret := sys_umtx_op(addr, _UMTX_OP_WAIT_UINT_PRIVATE, val, nil, tsp) + if ret >= 0 || ret == -_EINTR { + return + } + print("umtx_wait addr=", addr, " val=", val, " ret=", ret, "\n") + *(*int32)(unsafe.Pointer(uintptr(0x1005))) = 0x1005 + } + + //go:nosplit + func futexwakeup(addr *uint32, cnt uint32) { + ret := sys_umtx_op(addr, _UMTX_OP_WAKE_PRIVATE, cnt, nil, nil) + if ret >= 0 { + return + } + + systemstack(func() { + print("umtx_wake_addr=", addr, " ret=", ret, "\n") + }) + } + + func thr_start() + + func newosproc(mp *m, stk unsafe.Pointer) { + if false { + print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " thr_start=", funcPC(thr_start), " id=", mp.id, "/", mp.tls[0], " ostk=", &mp, "\n") + } + + // NOTE(rsc): This code is confused. stackbase is the top of the stack + // and is equal to stk. However, it's working, so I'm not changing it. + param := thrparam{ + start_func: funcPC(thr_start), + arg: unsafe.Pointer(mp), + stack_base: mp.g0.stack.hi, + stack_size: uintptr(stk) - mp.g0.stack.hi, + child_tid: unsafe.Pointer(&mp.procid), + parent_tid: nil, + tls_base: unsafe.Pointer(&mp.tls[0]), + tls_size: unsafe.Sizeof(mp.tls), + } + mp.tls[0] = uintptr(mp.id) // so 386 asm can find it + + var oset sigset + sigprocmask(&sigset_all, &oset) + thr_new(¶m, int32(unsafe.Sizeof(param))) + sigprocmask(&oset, nil) + } + + func osinit() { + ncpu = getncpu() + } + + var urandom_data [_HashRandomBytes]byte + var urandom_dev = []byte("/dev/random\x00") + + //go:nosplit + func get_random_data(rnd *unsafe.Pointer, rnd_len *int32) { + fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0) + if read(fd, unsafe.Pointer(&urandom_data), _HashRandomBytes) == _HashRandomBytes { + *rnd = unsafe.Pointer(&urandom_data[0]) + *rnd_len = _HashRandomBytes + } else { + *rnd = nil + *rnd_len = 0 + } + close(fd) + } + + func goenvs() { + goenvs_unix() + } + + // Called to initialize a new m (including the bootstrap m). + // Called on the parent thread (main thread in case of bootstrap), can allocate memory. + func mpreinit(mp *m) { + mp.gsignal = malg(32 * 1024) + mp.gsignal.m = mp + } + + // Called to initialize a new m (including the bootstrap m). + // Called on the new thread, can not allocate memory. + func minit() { + _g_ := getg() + + // m.procid is a uint64, but thr_new writes a uint32 on 32-bit systems. + // Fix it up. (Only matters on big-endian, but be clean anyway.) + if ptrSize == 4 { + _g_.m.procid = uint64(*(*uint32)(unsafe.Pointer(&_g_.m.procid))) + } + + // Initialize signal handling. + signalstack((*byte)(unsafe.Pointer(_g_.m.gsignal.stack.lo)), 32*1024) + sigprocmask(&sigset_none, nil) + } + + // Called from dropm to undo the effect of an minit. + func unminit() { + signalstack(nil, 0) + } + + func memlimit() uintptr { + /* + TODO: Convert to Go when something actually uses the result. + Rlimit rl; + extern byte runtime·text[], runtime·end[]; + uintptr used; + + if(runtime·getrlimit(RLIMIT_AS, &rl) != 0) + return 0; + if(rl.rlim_cur >= 0x7fffffff) + return 0; + + // Estimate our VM footprint excluding the heap. 
+ // Not an exact science: use size of binary plus + // some room for thread stacks. + used = runtime·end - runtime·text + (64<<20); + if(used >= rl.rlim_cur) + return 0; + + // If there's not at least 16 MB left, we're probably + // not going to be able to do much. Treat as no limit. + rl.rlim_cur -= used; + if(rl.rlim_cur < (16<<20)) + return 0; + + return rl.rlim_cur - used; + */ + + return 0 + } + + func sigtramp() + + type sigactiont struct { + sa_handler uintptr + sa_flags int32 + sa_mask sigset + } + + func setsig(i int32, fn uintptr, restart bool) { + var sa sigactiont + sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK + if restart { + sa.sa_flags |= _SA_RESTART + } + sa.sa_mask = sigset_all + if fn == funcPC(sighandler) { + fn = funcPC(sigtramp) + } + sa.sa_handler = fn + sigaction(i, &sa, nil) + } + func getsig(i int32) uintptr { + var sa sigactiont + sigaction(i, nil, &sa) + if sa.sa_handler == funcPC(sigtramp) { + return funcPC(sighandler) + } + return sa.sa_handler + } + + func signalstack(p *byte, n int32) { + var st stackt + st.ss_sp = uintptr(unsafe.Pointer(p)) + st.ss_size = uintptr(n) + st.ss_flags = 0 + if p == nil { + st.ss_flags = _SS_DISABLE + } + sigaltstack(&st, nil) + } + + func unblocksignals() { + sigprocmask(&sigset_none, nil) + } diff --cc src/runtime/os1_linux.go index 0000000000,0d24c5edc9..67fa6391e1 mode 000000,100644..100644 --- a/src/runtime/os1_linux.go +++ b/src/runtime/os1_linux.go @@@ -1,0 -1,287 +1,287 @@@ + // Copyright 2009 The Go Authors. All rights reserved. + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + + package runtime + + import "unsafe" + + var sigset_none sigset + var sigset_all sigset = sigset{^uint32(0), ^uint32(0)} + + // Linux futex. + // + // futexsleep(uint32 *addr, uint32 val) + // futexwakeup(uint32 *addr) + // + // Futexsleep atomically checks if *addr == val and if so, sleeps on addr. + // Futexwakeup wakes up threads sleeping on addr. + // Futexsleep is allowed to wake up spuriously. + + const ( + _FUTEX_WAIT = 0 + _FUTEX_WAKE = 1 + ) + + // Atomically, + // if(*addr == val) sleep + // Might be woken up spuriously; that's allowed. + // Don't sleep longer than ns; ns < 0 means forever. + //go:nosplit + func futexsleep(addr *uint32, val uint32, ns int64) { + var ts timespec + + // Some Linux kernels have a bug where futex of + // FUTEX_WAIT returns an internal error code + // as an errno. Libpthread ignores the return value + // here, and so can we: as it says a few lines up, + // spurious wakeups are allowed. + if ns < 0 { + futex(unsafe.Pointer(addr), _FUTEX_WAIT, val, nil, nil, 0) + return + } + + // It's difficult to live within the no-split stack limits here. + // On ARM and 386, a 64-bit divide invokes a general software routine + // that needs more stack than we can afford. So we use timediv instead. + // But on real 64-bit systems, where words are larger but the stack limit + // is not, even timediv is too heavy, and we really need to use just an + // ordinary machine instruction. + if ptrSize == 8 { - ts.set_sec(int32(ns / 1000000000)) ++ ts.set_sec(ns / 1000000000) + ts.set_nsec(int32(ns % 1000000000)) + } else { + ts.tv_nsec = 0 - ts.set_sec(timediv(ns, 1000000000, (*int32)(unsafe.Pointer(&ts.tv_nsec)))) ++ ts.set_sec(int64(timediv(ns, 1000000000, (*int32)(unsafe.Pointer(&ts.tv_nsec))))) + } + futex(unsafe.Pointer(addr), _FUTEX_WAIT, val, unsafe.Pointer(&ts), nil, 0) + } + + // If any procs are sleeping on addr, wake up at most cnt. 
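An aside on the futexsleep code above, not part of the patch: what a timediv-style helper buys on 386/ARM. Dividing an int64 by 10^9 with "/" calls the software 64-bit divide routine, which needs more stack than a nosplit function can afford, while shift-and-subtract division needs only compares, shifts, and subtractions. The sketch below is not the runtime's timediv, just the same idea with invented names, assuming the quotient fits in 31 bits as it does for sleep timeouts.

	package main

	import "fmt"

	func div64by32(v int64, div int32, rem *int32) int32 {
		res := int32(0)
		for bit := 30; bit >= 0; bit-- {
			if v >= int64(div)<<uint(bit) {
				v -= int64(div) << uint(bit)
				res |= 1 << uint(bit)
			}
		}
		if rem != nil {
			*rem = int32(v)
		}
		return res
	}

	func main() {
		ns := int64(3750000123) // 3.750000123 seconds
		var nsec int32
		sec := div64by32(ns, 1000000000, &nsec)
		fmt.Println(sec, nsec)                    // 3 750000123
		fmt.Println(ns/1000000000, ns%1000000000) // same result via the 64-bit divide
	}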
+ //go:nosplit + func futexwakeup(addr *uint32, cnt uint32) { + ret := futex(unsafe.Pointer(addr), _FUTEX_WAKE, cnt, nil, nil, 0) + if ret >= 0 { + return + } + + // I don't know that futex wakeup can return + // EAGAIN or EINTR, but if it does, it would be + // safe to loop and call futex again. + systemstack(func() { + print("futexwakeup addr=", addr, " returned ", ret, "\n") + }) + + *(*int32)(unsafe.Pointer(uintptr(0x1006))) = 0x1006 + } + + func getproccount() int32 { + var buf [16]uintptr + r := sched_getaffinity(0, unsafe.Sizeof(buf), &buf[0]) + n := int32(0) + for _, v := range buf[:r/ptrSize] { + for i := 0; i < 64; i++ { + n += int32(v & 1) + v >>= 1 + } + } + if n == 0 { + n = 1 + } + return n + } + + // Clone, the Linux rfork. + const ( + _CLONE_VM = 0x100 + _CLONE_FS = 0x200 + _CLONE_FILES = 0x400 + _CLONE_SIGHAND = 0x800 + _CLONE_PTRACE = 0x2000 + _CLONE_VFORK = 0x4000 + _CLONE_PARENT = 0x8000 + _CLONE_THREAD = 0x10000 + _CLONE_NEWNS = 0x20000 + _CLONE_SYSVSEM = 0x40000 + _CLONE_SETTLS = 0x80000 + _CLONE_PARENT_SETTID = 0x100000 + _CLONE_CHILD_CLEARTID = 0x200000 + _CLONE_UNTRACED = 0x800000 + _CLONE_CHILD_SETTID = 0x1000000 + _CLONE_STOPPED = 0x2000000 + _CLONE_NEWUTS = 0x4000000 + _CLONE_NEWIPC = 0x8000000 + ) + + func newosproc(mp *m, stk unsafe.Pointer) { + /* + * note: strace gets confused if we use CLONE_PTRACE here. + */ + var flags int32 = _CLONE_VM | /* share memory */ + _CLONE_FS | /* share cwd, etc */ + _CLONE_FILES | /* share fd table */ + _CLONE_SIGHAND | /* share sig handler table */ + _CLONE_THREAD /* revisit - okay for now */ + + mp.tls[0] = uintptr(mp.id) // so 386 asm can find it + if false { + print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " clone=", funcPC(clone), " id=", mp.id, "/", mp.tls[0], " ostk=", &mp, "\n") + } + + // Disable signals during clone, so that the new thread starts + // with signals disabled. It will enable them in minit. + var oset sigset + rtsigprocmask(_SIG_SETMASK, &sigset_all, &oset, int32(unsafe.Sizeof(oset))) + ret := clone(flags, stk, unsafe.Pointer(mp), unsafe.Pointer(mp.g0), unsafe.Pointer(funcPC(mstart))) + rtsigprocmask(_SIG_SETMASK, &oset, nil, int32(unsafe.Sizeof(oset))) + + if ret < 0 { + print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", -ret, ")\n") + gothrow("newosproc") + } + } + + func osinit() { + ncpu = getproccount() + } + + // Random bytes initialized at startup. These come + // from the ELF AT_RANDOM auxiliary vector (vdso_linux_amd64.c). + // byte* runtime·startup_random_data; + // uint32 runtime·startup_random_data_len; + + var urandom_data [_HashRandomBytes]byte + var urandom_dev = []byte("/dev/random\x00") + + //go:nosplit + func get_random_data(rnd *unsafe.Pointer, rnd_len *int32) { + if startup_random_data != nil { + *rnd = unsafe.Pointer(startup_random_data) + *rnd_len = int32(startup_random_data_len) + return + } + fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0) + if read(fd, unsafe.Pointer(&urandom_data), _HashRandomBytes) == _HashRandomBytes { + *rnd = unsafe.Pointer(&urandom_data[0]) + *rnd_len = _HashRandomBytes + } else { + *rnd = nil + *rnd_len = 0 + } + close(fd) + } + + func goenvs() { + goenvs_unix() + } + + // Called to initialize a new m (including the bootstrap m). + // Called on the parent thread (main thread in case of bootstrap), can allocate memory. + func mpreinit(mp *m) { + mp.gsignal = malg(32 * 1024) // Linux wants >= 2K + mp.gsignal.m = mp + } + + // Called to initialize a new m (including the bootstrap m). 
+ // Called on the new thread, can not allocate memory. + func minit() { + // Initialize signal handling. + _g_ := getg() + signalstack((*byte)(unsafe.Pointer(_g_.m.gsignal.stack.lo)), 32*1024) + rtsigprocmask(_SIG_SETMASK, &sigset_none, nil, int32(unsafe.Sizeof(sigset_none))) + } + + // Called from dropm to undo the effect of an minit. + func unminit() { + signalstack(nil, 0) + } + + func memlimit() uintptr { + /* + TODO: Convert to Go when something actually uses the result. + + Rlimit rl; + extern byte runtime·text[], runtime·end[]; + uintptr used; + + if(runtime·getrlimit(RLIMIT_AS, &rl) != 0) + return 0; + if(rl.rlim_cur >= 0x7fffffff) + return 0; + + // Estimate our VM footprint excluding the heap. + // Not an exact science: use size of binary plus + // some room for thread stacks. + used = runtime·end - runtime·text + (64<<20); + if(used >= rl.rlim_cur) + return 0; + + // If there's not at least 16 MB left, we're probably + // not going to be able to do much. Treat as no limit. + rl.rlim_cur -= used; + if(rl.rlim_cur < (16<<20)) + return 0; + + return rl.rlim_cur - used; + */ + + return 0 + } + + //#ifdef GOARCH_386 + //#define sa_handler k_sa_handler + //#endif + + func sigreturn() + func sigtramp() + + func setsig(i int32, fn uintptr, restart bool) { + var sa sigactiont + memclr(unsafe.Pointer(&sa), unsafe.Sizeof(sa)) + sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK | _SA_RESTORER + if restart { + sa.sa_flags |= _SA_RESTART + } + sa.sa_mask = ^uint64(0) + // Although Linux manpage says "sa_restorer element is obsolete and + // should not be used". x86_64 kernel requires it. Only use it on + // x86. + if GOARCH == "386" || GOARCH == "amd64" { + sa.sa_restorer = funcPC(sigreturn) + } + if fn == funcPC(sighandler) { + fn = funcPC(sigtramp) + } + sa.sa_handler = fn + if rt_sigaction(uintptr(i), &sa, nil, unsafe.Sizeof(sa.sa_mask)) != 0 { + gothrow("rt_sigaction failure") + } + } + + func getsig(i int32) uintptr { + var sa sigactiont + + memclr(unsafe.Pointer(&sa), unsafe.Sizeof(sa)) + if rt_sigaction(uintptr(i), nil, &sa, unsafe.Sizeof(sa.sa_mask)) != 0 { + gothrow("rt_sigaction read failure") + } + if sa.sa_handler == funcPC(sigtramp) { + return funcPC(sighandler) + } + return sa.sa_handler + } + + func signalstack(p *byte, n int32) { + var st sigaltstackt + st.ss_sp = p + st.ss_size = uintptr(n) + st.ss_flags = 0 + if p == nil { + st.ss_flags = _SS_DISABLE + } + sigaltstack(&st, nil) + } + + func unblocksignals() { + rtsigprocmask(_SIG_SETMASK, &sigset_none, nil, int32(unsafe.Sizeof(sigset_none))) + } diff --cc src/runtime/os1_openbsd.go index 0000000000,5c6ea74121..d5ffe10a81 mode 000000,100644..100644 --- a/src/runtime/os1_openbsd.go +++ b/src/runtime/os1_openbsd.go @@@ -1,0 -1,235 +1,235 @@@ + // Copyright 2011 The Go Authors. All rights reserved. + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + + package runtime + + import "unsafe" + + const ( + ESRCH = 3 + EAGAIN = 35 + EWOULDBLOCK = EAGAIN + ENOTSUP = 91 + + // From OpenBSD's sys/time.h + CLOCK_REALTIME = 0 + CLOCK_VIRTUAL = 1 + CLOCK_PROF = 2 + CLOCK_MONOTONIC = 3 + ) + + var sigset_none = uint32(0) + var sigset_all = ^sigset_none + + // From OpenBSD's + const ( + CTL_HW = 6 + HW_NCPU = 3 + ) + + func getncpu() int32 { + mib := [2]uint32{CTL_HW, HW_NCPU} + out := uint32(0) + nout := unsafe.Sizeof(out) + + // Fetch hw.ncpu via sysctl. 
+ ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0) + if ret >= 0 { + return int32(out) + } + return 1 + } + + //go:nosplit + func semacreate() uintptr { + return 1 + } + + //go:nosplit + func semasleep(ns int64) int32 { + _g_ := getg() + + // Compute sleep deadline. + var tsp *timespec + if ns >= 0 { + var ts timespec + var nsec int32 + ns += nanotime() - ts.set_sec(timediv(ns, 1000000000, &nsec)) ++ ts.set_sec(int64(timediv(ns, 1000000000, &nsec))) + ts.set_nsec(nsec) + tsp = &ts + } + + for { + // spin-mutex lock + for { + if xchg(&_g_.m.waitsemalock, 1) == 0 { + break + } + osyield() + } + + if _g_.m.waitsemacount != 0 { + // semaphore is available. + _g_.m.waitsemacount-- + // spin-mutex unlock + atomicstore(&_g_.m.waitsemalock, 0) + return 0 // semaphore acquired + } + + // sleep until semaphore != 0 or timeout. + // thrsleep unlocks m.waitsemalock. + ret := thrsleep((uintptr)(unsafe.Pointer(&_g_.m.waitsemacount)), CLOCK_MONOTONIC, tsp, (uintptr)(unsafe.Pointer(&_g_.m.waitsemalock)), (*int32)(unsafe.Pointer(&_g_.m.waitsemacount))) + if ret == EWOULDBLOCK { + return -1 + } + } + } + + //go:nosplit + func semawakeup(mp *m) { + // spin-mutex lock + for { + if xchg(&mp.waitsemalock, 1) == 0 { + break + } + osyield() + } + mp.waitsemacount++ + ret := thrwakeup(uintptr(unsafe.Pointer(&mp.waitsemacount)), 1) + if ret != 0 && ret != ESRCH { + // semawakeup can be called on signal stack. + systemstack(func() { + print("thrwakeup addr=", &mp.waitsemacount, " sem=", mp.waitsemacount, " ret=", ret, "\n") + }) + } + // spin-mutex unlock + atomicstore(&mp.waitsemalock, 0) + } + + func newosproc(mp *m, stk unsafe.Pointer) { + if false { + print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " id=", mp.id, "/", int32(mp.tls[0]), " ostk=", &mp, "\n") + } + + mp.tls[0] = uintptr(mp.id) // so 386 asm can find it + + param := tforkt{ + tf_tcb: unsafe.Pointer(&mp.tls[0]), + tf_tid: (*int32)(unsafe.Pointer(&mp.procid)), + tf_stack: uintptr(stk), + } + + oset := sigprocmask(_SIG_SETMASK, sigset_all) + ret := tfork(¶m, unsafe.Sizeof(param), mp, mp.g0, funcPC(mstart)) + sigprocmask(_SIG_SETMASK, oset) + + if ret < 0 { + print("runtime: failed to create new OS thread (have ", mcount()-1, " already; errno=", -ret, ")\n") + if ret == -ENOTSUP { + print("runtime: is kern.rthreads disabled?\n") + } + gothrow("runtime.newosproc") + } + } + + func osinit() { + ncpu = getncpu() + } + + var urandom_data [_HashRandomBytes]byte + var urandom_dev = []byte("/dev/urandom\x00") + + //go:nosplit + func get_random_data(rnd *unsafe.Pointer, rnd_len *int32) { + fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0) + if read(fd, unsafe.Pointer(&urandom_data), _HashRandomBytes) == _HashRandomBytes { + *rnd = unsafe.Pointer(&urandom_data[0]) + *rnd_len = _HashRandomBytes + } else { + *rnd = nil + *rnd_len = 0 + } + close(fd) + } + + func goenvs() { + goenvs_unix() + } + + // Called to initialize a new m (including the bootstrap m). + // Called on the parent thread (main thread in case of bootstrap), can allocate memory. + func mpreinit(mp *m) { + mp.gsignal = malg(32 * 1024) + mp.gsignal.m = mp + } + + // Called to initialize a new m (including the bootstrap m). + // Called on the new thread, can not allocate memory. + func minit() { + _g_ := getg() + + // m.procid is a uint64, but tfork writes an int32. Fix it up. 
+ _g_.m.procid = uint64(*(*int32)(unsafe.Pointer(&_g_.m.procid))) + + // Initialize signal handling + signalstack((*byte)(unsafe.Pointer(_g_.m.gsignal.stack.lo)), 32*1024) + sigprocmask(_SIG_SETMASK, sigset_none) + } + + // Called from dropm to undo the effect of an minit. + func unminit() { + signalstack(nil, 0) + } + + func memlimit() uintptr { + return 0 + } + + func sigtramp() + + type sigactiont struct { + sa_sigaction uintptr + sa_mask uint32 + sa_flags int32 + } + + func setsig(i int32, fn uintptr, restart bool) { + var sa sigactiont + sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK + if restart { + sa.sa_flags |= _SA_RESTART + } + sa.sa_mask = sigset_all + if fn == funcPC(sighandler) { + fn = funcPC(sigtramp) + } + sa.sa_sigaction = fn + sigaction(i, &sa, nil) + } + + func getsig(i int32) uintptr { + var sa sigactiont + sigaction(i, nil, &sa) + if sa.sa_sigaction == funcPC(sigtramp) { + return funcPC(sighandler) + } + return sa.sa_sigaction + } + + func signalstack(p *byte, n int32) { + var st stackt + + st.ss_sp = uintptr(unsafe.Pointer(p)) + st.ss_size = uintptr(n) + st.ss_flags = 0 + if p == nil { + st.ss_flags = _SS_DISABLE + } + sigaltstack(&st, nil) + } + + func unblocksignals() { + sigprocmask(_SIG_SETMASK, sigset_none) + } diff --cc src/runtime/os_linux_386.go index 0000000000,c4f95804ac..adcd5a1c4e mode 000000,100644..100644 --- a/src/runtime/os_linux_386.go +++ b/src/runtime/os_linux_386.go @@@ -1,0 -1,37 +1,36 @@@ + // Copyright 2009 The Go Authors. All rights reserved. + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + + package runtime + + import "unsafe" + + const ( + _AT_NULL = 0 + _AT_RANDOM = 25 + _AT_SYSINFO = 32 + ) + + var _vdso uint32 + -//go:nosplit -func linux_setup_vdso(argc int32, argv **byte) { ++func sysargs(argc int32, argv **byte) { + // skip over argv, envv to get to auxv + n := argc + 1 + for argv_index(argv, n) != nil { + n++ + } + n++ + auxv := (*[1 << 28]uint32)(add(unsafe.Pointer(argv), uintptr(n)*ptrSize)) + + for i := 0; auxv[i] != _AT_NULL; i += 2 { + switch auxv[i] { + case _AT_SYSINFO: + _vdso = auxv[i+1] + + case _AT_RANDOM: + startup_random_data = (*byte)(unsafe.Pointer(uintptr(auxv[i+1]))) + startup_random_data_len = 16 + } + } + } diff --cc src/runtime/proc1.go index 0000000000,81b211d0d3..8c941dd35d mode 000000,100644..100644 --- a/src/runtime/proc1.go +++ b/src/runtime/proc1.go @@@ -1,0 -1,3170 +1,3186 @@@ + // Copyright 2009 The Go Authors. All rights reserved. + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + + package runtime + + import "unsafe" + + var ( + m0 m + g0 g + ) + + // Goroutine scheduler + // The scheduler's job is to distribute ready-to-run goroutines over worker threads. + // + // The main concepts are: + // G - goroutine. + // M - worker thread, or machine. + // P - processor, a resource that is required to execute Go code. + // M must have an associated P to execute Go code, however it can be + // blocked or in a syscall w/o an associated P. + // + // Design doc at http://golang.org/s/go11sched. + + const ( + // Number of goroutine ids to grab from sched.goidgen to local per-P cache at once. + // 16 seems to provide enough amortization, but other than that it's mostly arbitrary number. 
+ _GoidCacheBatch = 16 + ) + + /* + SchedT sched; + int32 gomaxprocs; + uint32 needextram; + bool iscgo; + M m0; + G g0; // idle goroutine for m0 + G* lastg; + M* allm; + M* extram; + P* allp[MaxGomaxprocs+1]; + int8* goos; + int32 ncpu; + int32 newprocs; + + Mutex allglock; // the following vars are protected by this lock or by stoptheworld + G** allg; + Slice allgs; + uintptr allglen; + ForceGCState forcegc; + + void mstart(void); + static void runqput(P*, G*); + static G* runqget(P*); + static bool runqputslow(P*, G*, uint32, uint32); + static G* runqsteal(P*, P*); + static void mput(M*); + static M* mget(void); + static void mcommoninit(M*); + static void schedule(void); + static void procresize(int32); + static void acquirep(P*); + static P* releasep(void); + static void newm(void(*)(void), P*); + static void stopm(void); + static void startm(P*, bool); + static void handoffp(P*); + static void wakep(void); + static void stoplockedm(void); + static void startlockedm(G*); + static void sysmon(void); + static uint32 retake(int64); + static void incidlelocked(int32); + static void checkdead(void); + static void exitsyscall0(G*); + void park_m(G*); + static void goexit0(G*); + static void gfput(P*, G*); + static G* gfget(P*); + static void gfpurge(P*); + static void globrunqput(G*); + static void globrunqputbatch(G*, G*, int32); + static G* globrunqget(P*, int32); + static P* pidleget(void); + static void pidleput(P*); + static void injectglist(G*); + static bool preemptall(void); + static bool preemptone(P*); + static bool exitsyscallfast(void); + static bool haveexperiment(int8*); + void allgadd(G*); + static void dropg(void); + + extern String buildVersion; + */ + + // The bootstrap sequence is: + // + // call osinit + // call schedinit + // make & queue new G + // call runtime·mstart + // + // The new G calls runtime·main. + func schedinit() { + // raceinit must be the first call to race detector. + // In particular, it must be done before mallocinit below calls racemapshadow. + _g_ := getg() + if raceenabled { + _g_.racectx = raceinit() + } + + sched.maxmcount = 10000 + + tracebackinit() + symtabinit() + stackinit() + mallocinit() + mcommoninit(_g_.m) + + goargs() + goenvs() + parsedebugvars() + gcinit() + + sched.lastpoll = uint64(nanotime()) + procs := 1 + if n := goatoi(gogetenv("GOMAXPROCS")); n > 0 { + if n > _MaxGomaxprocs { + n = _MaxGomaxprocs + } + procs = n + } + procresize(int32(procs)) + + if buildVersion == "" { + // Condition should never trigger. This code just serves + // to ensure runtime·buildVersion is kept in the resulting binary. + buildVersion = "unknown" + } + } + + func newsysmon() { + _newm(sysmon, nil) + } + + func dumpgstatus(gp *g) { + _g_ := getg() + print("runtime: gp: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n") + print("runtime: g: g=", _g_, ", goid=", _g_.goid, ", g->atomicstatus=", readgstatus(_g_), "\n") + } + + func checkmcount() { + // sched lock is held + if sched.mcount > sched.maxmcount { + print("runtime: program exceeds ", sched.maxmcount, "-thread limit\n") + gothrow("thread exhaustion") + } + } + + func mcommoninit(mp *m) { + _g_ := getg() + + // g0 stack won't make sense for user (and is not necessary unwindable). 
+ if _g_ != _g_.m.g0 { + callers(1, &mp.createstack[0], len(mp.createstack)) + } + + mp.fastrand = 0x49f6428a + uint32(mp.id) + uint32(cputicks()) + if mp.fastrand == 0 { + mp.fastrand = 0x49f6428a + } + + lock(&sched.lock) + mp.id = sched.mcount + sched.mcount++ + checkmcount() + mpreinit(mp) + if mp.gsignal != nil { + mp.gsignal.stackguard1 = mp.gsignal.stack.lo + _StackGuard + } + + // Add to allm so garbage collector doesn't free g->m + // when it is just in a register or thread-local storage. + mp.alllink = allm + + // NumCgoCall() iterates over allm w/o schedlock, + // so we need to publish it safely. + atomicstorep(unsafe.Pointer(&allm), unsafe.Pointer(mp)) + unlock(&sched.lock) + } + + // Mark gp ready to run. + func ready(gp *g) { + status := readgstatus(gp) + + // Mark runnable. + _g_ := getg() + _g_.m.locks++ // disable preemption because it can be holding p in a local var + if status&^_Gscan != _Gwaiting { + dumpgstatus(gp) + gothrow("bad g->status in ready") + } + + // status is Gwaiting or Gscanwaiting, make Grunnable and put on runq + casgstatus(gp, _Gwaiting, _Grunnable) + runqput(_g_.m.p, gp) + if atomicload(&sched.npidle) != 0 && atomicload(&sched.nmspinning) == 0 { // TODO: fast atomic + wakep() + } + _g_.m.locks-- + if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack + _g_.stackguard0 = stackPreempt + } + } + + func gcprocs() int32 { + // Figure out how many CPUs to use during GC. + // Limited by gomaxprocs, number of actual CPUs, and MaxGcproc. + lock(&sched.lock) + n := gomaxprocs + if n > ncpu { + n = ncpu + } + if n > _MaxGcproc { + n = _MaxGcproc + } + if n > sched.nmidle+1 { // one M is currently running + n = sched.nmidle + 1 + } + unlock(&sched.lock) + return n + } + + func needaddgcproc() bool { + lock(&sched.lock) + n := gomaxprocs + if n > ncpu { + n = ncpu + } + if n > _MaxGcproc { + n = _MaxGcproc + } + n -= sched.nmidle + 1 // one M is currently running + unlock(&sched.lock) + return n > 0 + } + + func helpgc(nproc int32) { + _g_ := getg() + lock(&sched.lock) + pos := 0 + for n := int32(1); n < nproc; n++ { // one M is currently running + if allp[pos].mcache == _g_.m.mcache { + pos++ + } + mp := mget() + if mp == nil { + gothrow("gcprocs inconsistency") + } + mp.helpgc = n + mp.mcache = allp[pos].mcache + pos++ + notewakeup(&mp.park) + } + unlock(&sched.lock) + } + + // Similar to stoptheworld but best-effort and can be called several times. + // There is no reverse operation, used during crashing. + // This function must not lock any mutexes. + func freezetheworld() { + if gomaxprocs == 1 { + return + } + // stopwait and preemption requests can be lost + // due to races with concurrently executing threads, + // so try several times + for i := 0; i < 5; i++ { + // this should tell the scheduler to not start any new goroutines + sched.stopwait = 0x7fffffff + atomicstore(&sched.gcwaiting, 1) + // this should stop running goroutines + if !preemptall() { + break // no running goroutines + } + usleep(1000) + } + // to be sure + usleep(1000) + preemptall() + usleep(1000) + } + + func isscanstatus(status uint32) bool { + if status == _Gscan { + gothrow("isscanstatus: Bad status Gscan") + } + return status&_Gscan == _Gscan + } + + // All reads and writes of g's status go through readgstatus, casgstatus + // castogscanstatus, casfrom_Gscanstatus. + //go:nosplit + func readgstatus(gp *g) uint32 { + return atomicload(&gp.atomicstatus) + } + + // The Gscanstatuses are acting like locks and this releases them. 
+ // If it proves to be a performance hit we should be able to make these + // simple atomic stores but for now we are going to throw if + // we see an inconsistent state. + func casfrom_Gscanstatus(gp *g, oldval, newval uint32) { + success := false + + // Check that transition is valid. + switch oldval { + case _Gscanrunnable, + _Gscanwaiting, + _Gscanrunning, + _Gscansyscall: + if newval == oldval&^_Gscan { + success = cas(&gp.atomicstatus, oldval, newval) + } + case _Gscanenqueue: + if newval == _Gwaiting { + success = cas(&gp.atomicstatus, oldval, newval) + } + } + if !success { + print("runtime: casfrom_Gscanstatus failed gp=", gp, ", oldval=", hex(oldval), ", newval=", hex(newval), "\n") + dumpgstatus(gp) + gothrow("casfrom_Gscanstatus: gp->status is not in scan state") + } + } + + // This will return false if the gp is not in the expected status and the cas fails. + // This acts like a lock acquire while the casfromgstatus acts like a lock release. + func castogscanstatus(gp *g, oldval, newval uint32) bool { + switch oldval { + case _Grunnable, + _Gwaiting, + _Gsyscall: + if newval == oldval|_Gscan { + return cas(&gp.atomicstatus, oldval, newval) + } + case _Grunning: + if newval == _Gscanrunning || newval == _Gscanenqueue { + return cas(&gp.atomicstatus, oldval, newval) + } + } + print("runtime: castogscanstatus oldval=", hex(oldval), " newval=", hex(newval), "\n") + gothrow("castogscanstatus") + panic("not reached") + } + + // If asked to move to or from a Gscanstatus this will throw. Use the castogscanstatus + // and casfrom_Gscanstatus instead. + // casgstatus will loop if the g->atomicstatus is in a Gscan status until the routine that + // put it in the Gscan state is finished. + //go:nosplit + func casgstatus(gp *g, oldval, newval uint32) { + if (oldval&_Gscan != 0) || (newval&_Gscan != 0) || oldval == newval { + systemstack(func() { + print("casgstatus: oldval=", hex(oldval), " newval=", hex(newval), "\n") + gothrow("casgstatus: bad incoming values") + }) + } + + // loop if gp->atomicstatus is in a scan state giving + // GC time to finish and change the state to oldval. + for !cas(&gp.atomicstatus, oldval, newval) { - // Help GC if needed. - if gp.preemptscan && !gp.gcworkdone && (oldval == _Grunning || oldval == _Gsyscall) { - gp.preemptscan = false - systemstack(func() { - gcphasework(gp) - }) - } + } + } + + // stopg ensures that gp is stopped at a GC safe point where its stack can be scanned + // or in the context of a moving collector the pointers can be flipped from pointing + // to old object to pointing to new objects. + // If stopg returns true, the caller knows gp is at a GC safe point and will remain there until + // the caller calls restartg. + // If stopg returns false, the caller is not responsible for calling restartg. This can happen + // if another thread, either the gp itself or another GC thread is taking the responsibility + // to do the GC work related to this thread. + func stopg(gp *g) bool { + for { + if gp.gcworkdone { + return false + } + + switch s := readgstatus(gp); s { + default: + dumpgstatus(gp) + gothrow("stopg: gp->atomicstatus is not valid") + + case _Gdead: + return false + + case _Gcopystack: + // Loop until a new stack is in place. + + case _Grunnable, + _Gsyscall, + _Gwaiting: + // Claim goroutine by setting scan bit. + if !castogscanstatus(gp, s, s|_Gscan) { + break + } + // In scan state, do work. 
+ gcphasework(gp) + return true + + case _Gscanrunnable, + _Gscanwaiting, + _Gscansyscall: + // Goroutine already claimed by another GC helper. + return false + + case _Grunning: ++ if gcphase == _GCscan { ++ // Running routines not scanned during ++ // GCscan phase, we only scan non-running routines. ++ gp.gcworkdone = true ++ return false ++ } ++ + // Claim goroutine, so we aren't racing with a status + // transition away from Grunning. + if !castogscanstatus(gp, _Grunning, _Gscanrunning) { + break + } + + // Mark gp for preemption. + if !gp.gcworkdone { + gp.preemptscan = true + gp.preempt = true + gp.stackguard0 = stackPreempt + } + + // Unclaim. + casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning) + return false + } + } + } + + // The GC requests that this routine be moved from a scanmumble state to a mumble state. + func restartg(gp *g) { + s := readgstatus(gp) + switch s { + default: + dumpgstatus(gp) + gothrow("restartg: unexpected status") + + case _Gdead: + // ok + + case _Gscanrunnable, + _Gscanwaiting, + _Gscansyscall: + casfrom_Gscanstatus(gp, s, s&^_Gscan) + + // Scan is now completed. + // Goroutine now needs to be made runnable. + // We put it on the global run queue; ready blocks on the global scheduler lock. + case _Gscanenqueue: + casfrom_Gscanstatus(gp, _Gscanenqueue, _Gwaiting) + if gp != getg().m.curg { + gothrow("processing Gscanenqueue on wrong m") + } + dropg() + ready(gp) + } + } + + func stopscanstart(gp *g) { + _g_ := getg() + if _g_ == gp { + gothrow("GC not moved to G0") + } + if stopg(gp) { + if !isscanstatus(readgstatus(gp)) { + dumpgstatus(gp) + gothrow("GC not in scan state") + } + restartg(gp) + } + } + + // Runs on g0 and does the actual work after putting the g back on the run queue. + func mquiesce(gpmaster *g) { - activeglen := len(allgs) + // enqueue the calling goroutine. + restartg(gpmaster) ++ ++ activeglen := len(allgs) + for i := 0; i < activeglen; i++ { + gp := allgs[i] + if readgstatus(gp) == _Gdead { + gp.gcworkdone = true // noop scan. + } else { + gp.gcworkdone = false + } + stopscanstart(gp) + } + + // Check that the G's gcwork (such as scanning) has been done. If not do it now. + // You can end up doing work here if the page trap on a Grunning Goroutine has + // not been sprung or in some race situations. For example a runnable goes dead + // and is started up again with a gp->gcworkdone set to false. + for i := 0; i < activeglen; i++ { + gp := allgs[i] + for !gp.gcworkdone { + status := readgstatus(gp) + if status == _Gdead { + //do nothing, scan not needed. + gp.gcworkdone = true // scan is a noop + break + } + if status == _Grunning && gp.stackguard0 == uintptr(stackPreempt) && notetsleep(&sched.stopnote, 100*1000) { // nanosecond arg + noteclear(&sched.stopnote) + } else { + stopscanstart(gp) + } + } + } + + for i := 0; i < activeglen; i++ { + gp := allgs[i] + status := readgstatus(gp) + if isscanstatus(status) { + print("mstopandscang:bottom: post scan bad status gp=", gp, " has status ", hex(status), "\n") + dumpgstatus(gp) + } + if !gp.gcworkdone && status != _Gdead { + print("mstopandscang:bottom: post scan gp=", gp, "->gcworkdone still false\n") + dumpgstatus(gp) + } + } + + schedule() // Never returns. + } + + // quiesce moves all the goroutines to a GC safepoint which for now is a at preemption point. + // If the global gcphase is GCmark quiesce will ensure that all of the goroutine's stacks + // have been scanned before it returns. 
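An aside, not part of the patch: the "scan bit as a lock" idea behind castogscanstatus and casfrom_Gscanstatus above. A GC helper claims a goroutine by CASing status to status|scanBit and releases it by CASing the bit back off; ordinary status transitions never set that bit, so they cannot race with a scan in progress. The status values below are invented stand-ins, not the runtime's constants.

	package main

	import (
		"fmt"
		"sync/atomic"
	)

	const (
		statRunnable uint32 = 1
		statWaiting  uint32 = 2
		statScan     uint32 = 0x1000 // the claim ("lock") bit
	)

	// claim is the acquire half: it succeeds only if status still equals oldval.
	func claim(status *uint32, oldval uint32) bool {
		return atomic.CompareAndSwapUint32(status, oldval, oldval|statScan)
	}

	// release is the release half: the status must not have changed underneath.
	func release(status *uint32, oldval uint32) {
		if !atomic.CompareAndSwapUint32(status, oldval|statScan, oldval) {
			panic("status changed while the scan bit was held")
		}
	}

	func main() {
		status := statWaiting
		if claim(&status, statWaiting) {
			// The goroutine is claimed: its stack could be scanned here.
			fmt.Printf("claimed: %#x\n", atomic.LoadUint32(&status))
			release(&status, statWaiting)
		}
		fmt.Printf("released: %#x\n", atomic.LoadUint32(&status))
	}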
+ func quiesce(mastergp *g) { + castogscanstatus(mastergp, _Grunning, _Gscanenqueue) + // Now move this to the g0 (aka m) stack. + // g0 will potentially scan this thread and put mastergp on the runqueue + mcall(mquiesce) + } + + // This is used by the GC as well as the routines that do stack dumps. In the case + // of GC all the routines can be reliably stopped. This is not always the case + // when the system is in panic or being exited. + func stoptheworld() { + _g_ := getg() + + // If we hold a lock, then we won't be able to stop another M + // that is blocked trying to acquire the lock. + if _g_.m.locks > 0 { + gothrow("stoptheworld: holding locks") + } + + lock(&sched.lock) + sched.stopwait = gomaxprocs + atomicstore(&sched.gcwaiting, 1) + preemptall() + // stop current P + _g_.m.p.status = _Pgcstop // Pgcstop is only diagnostic. + sched.stopwait-- + // try to retake all P's in Psyscall status + for i := 0; i < int(gomaxprocs); i++ { + p := allp[i] + s := p.status + if s == _Psyscall && cas(&p.status, s, _Pgcstop) { + sched.stopwait-- + } + } + // stop idle P's + for { + p := pidleget() + if p == nil { + break + } + p.status = _Pgcstop + sched.stopwait-- + } + wait := sched.stopwait > 0 + unlock(&sched.lock) + + // wait for remaining P's to stop voluntarily + if wait { + for { + // wait for 100us, then try to re-preempt in case of any races + if notetsleep(&sched.stopnote, 100*1000) { + noteclear(&sched.stopnote) + break + } + preemptall() + } + } + if sched.stopwait != 0 { + gothrow("stoptheworld: not stopped") + } + for i := 0; i < int(gomaxprocs); i++ { + p := allp[i] + if p.status != _Pgcstop { + gothrow("stoptheworld: not stopped") + } + } + } + + func mhelpgc() { + _g_ := getg() + _g_.m.helpgc = -1 + } + + func starttheworld() { + _g_ := getg() + + _g_.m.locks++ // disable preemption because it can be holding p in a local var + gp := netpoll(false) // non-blocking + injectglist(gp) + add := needaddgcproc() + lock(&sched.lock) + if newprocs != 0 { + procresize(newprocs) + newprocs = 0 + } else { + procresize(gomaxprocs) + } + sched.gcwaiting = 0 + + var p1 *p + for { + p := pidleget() + if p == nil { + break + } + // procresize() puts p's with work at the beginning of the list. + // Once we reach a p without a run queue, the rest don't have one either. + if p.runqhead == p.runqtail { + pidleput(p) + break + } + p.m = mget() + p.link = p1 + p1 = p + } + if sched.sysmonwait != 0 { + sched.sysmonwait = 0 + notewakeup(&sched.sysmonnote) + } + unlock(&sched.lock) + + for p1 != nil { + p := p1 + p1 = p1.link + if p.m != nil { + mp := p.m + p.m = nil + if mp.nextp != nil { + gothrow("starttheworld: inconsistent mp->nextp") + } + mp.nextp = p + notewakeup(&mp.park) + } else { + // Start M to run P. Do not start another M below. + _newm(nil, p) + add = false + } + } + + if add { + // If GC could have used another helper proc, start one now, + // in the hope that it will be available next time. + // It would have been even better to start it before the collection, + // but doing so requires allocating memory, so it's tricky to + // coordinate. This lazy approach works out in practice: + // we don't mind if the first couple gc rounds don't have quite + // the maximum number of procs. + _newm(mhelpgc, nil) + } + _g_.m.locks-- + if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack + _g_.stackguard0 = stackPreempt + } + } + + // Called to start an M. 
+ //go:nosplit + func mstart() { + _g_ := getg() + + if _g_.stack.lo == 0 { + // Initialize stack bounds from system stack. + // Cgo may have left stack size in stack.hi. + size := _g_.stack.hi + if size == 0 { + size = 8192 + } + _g_.stack.hi = uintptr(noescape(unsafe.Pointer(&size))) + _g_.stack.lo = _g_.stack.hi - size + 1024 + } + // Initialize stack guards so that we can start calling + // both Go and C functions with stack growth prologues. + _g_.stackguard0 = _g_.stack.lo + _StackGuard + _g_.stackguard1 = _g_.stackguard0 + mstart1() + } + + func mstart1() { + _g_ := getg() + + if _g_ != _g_.m.g0 { + gothrow("bad runtime·mstart") + } + + // Record top of stack for use by mcall. + // Once we call schedule we're never coming back, + // so other calls can reuse this stack space. + gosave(&_g_.m.g0.sched) + _g_.m.g0.sched.pc = ^uintptr(0) // make sure it is never used + asminit() + minit() + + // Install signal handlers; after minit so that minit can + // prepare the thread to be able to handle the signals. + if _g_.m == &m0 { + initsig() + } + + if _g_.m.mstartfn != nil { + fn := *(*func())(unsafe.Pointer(&_g_.m.mstartfn)) + fn() + } + + if _g_.m.helpgc != 0 { + _g_.m.helpgc = 0 + stopm() + } else if _g_.m != &m0 { + acquirep(_g_.m.nextp) + _g_.m.nextp = nil + } + schedule() + + // TODO(brainman): This point is never reached, because scheduler + // does not release os threads at the moment. But once this path + // is enabled, we must remove our seh here. + } + + // When running with cgo, we call _cgo_thread_start + // to start threads for us so that we can play nicely with + // foreign code. + var cgoThreadStart unsafe.Pointer + + type cgothreadstart struct { + g *g + tls *uint64 + fn unsafe.Pointer + } + + // Allocate a new m unassociated with any thread. + // Can use p for allocation context if needed. + func allocm(_p_ *p) *m { + _g_ := getg() + _g_.m.locks++ // disable GC because it can be called from sysmon + if _g_.m.p == nil { + acquirep(_p_) // temporarily borrow p for mallocs in this function + } + mp := newM() + mcommoninit(mp) + + // In case of cgo or Solaris, pthread_create will make us a stack. + // Windows and Plan 9 will layout sched stack on OS stack. + if iscgo || GOOS == "solaris" || GOOS == "windows" || GOOS == "plan9" { + mp.g0 = malg(-1) + } else { + mp.g0 = malg(8192) + } + mp.g0.m = mp + + if _p_ == _g_.m.p { + releasep() + } + _g_.m.locks-- + if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack + _g_.stackguard0 = stackPreempt + } + + return mp + } + + func allocg() *g { + return newG() + } + + // needm is called when a cgo callback happens on a + // thread without an m (a thread not created by Go). + // In this case, needm is expected to find an m to use + // and return with m, g initialized correctly. + // Since m and g are not set now (likely nil, but see below) + // needm is limited in what routines it can call. In particular + // it can only call nosplit functions (textflag 7) and cannot + // do any scheduling that requires an m. + // + // In order to avoid needing heavy lifting here, we adopt + // the following strategy: there is a stack of available m's + // that can be stolen. Using compare-and-swap + // to pop from the stack has ABA races, so we simulate + // a lock by doing an exchange (via casp) to steal the stack + // head and replace the top pointer with MLOCKED (1). + // This serves as a simple spin lock that we can use even + // without an m. 
The thread that locks the stack in this way + // unlocks the stack by storing a valid stack head pointer. + // + // In order to make sure that there is always an m structure + // available to be stolen, we maintain the invariant that there + // is always one more than needed. At the beginning of the + // program (if cgo is in use) the list is seeded with a single m. + // If needm finds that it has taken the last m off the list, its job + // is - once it has installed its own m so that it can do things like + // allocate memory - to create a spare m and put it on the list. + // + // Each of these extra m's also has a g0 and a curg that are + // pressed into service as the scheduling stack and current + // goroutine for the duration of the cgo callback. + // + // When the callback is done with the m, it calls dropm to + // put the m back on the list. + //go:nosplit + func needm(x byte) { + if needextram != 0 { + // Can happen if C/C++ code calls Go from a global ctor. + // Can not throw, because scheduler is not initialized yet. + // XXX + // write(2, unsafe.Pointer("fatal error: cgo callback before cgo call\n"), sizeof("fatal error: cgo callback before cgo call\n") - 1) + exit(1) + } + + // Lock extra list, take head, unlock popped list. + // nilokay=false is safe here because of the invariant above, + // that the extra list always contains or will soon contain + // at least one m. + mp := lockextra(false) + + // Set needextram when we've just emptied the list, + // so that the eventual call into cgocallbackg will + // allocate a new m for the extra list. We delay the + // allocation until then so that it can be done + // after exitsyscall makes sure it is okay to be + // running at all (that is, there's no garbage collection + // running right now). + mp.needextram = mp.schedlink == nil + unlockextra(mp.schedlink) + + // Install g (= m->g0) and set the stack bounds + // to match the current stack. We don't actually know + // how big the stack is, like we don't know how big any + // scheduling stack is, but we assume there's at least 32 kB, + // which is more than enough for us. + setg(mp.g0) + _g_ := getg() + _g_.stack.hi = uintptr(noescape(unsafe.Pointer(&x))) + 1024 + _g_.stack.lo = uintptr(noescape(unsafe.Pointer(&x))) - 32*1024 + _g_.stackguard0 = _g_.stack.lo + _StackGuard + + // Initialize this thread to use the m. + asminit() + minit() + } + + // newextram allocates an m and puts it on the extra list. + // It is called with a working local m, so that it can do things + // like call schedlock and allocate. + func newextram() { + // Create extra goroutine locked to extra m. + // The goroutine is the context in which the cgo callback will run. + // The sched.pc will never be returned to, but setting it to + // goexit makes clear to the traceback routines where + // the goroutine stack ends. + mp := allocm(nil) + gp := malg(4096) + gp.sched.pc = funcPC(goexit) + _PCQuantum + gp.sched.sp = gp.stack.hi + gp.sched.sp -= 4 * regSize // extra space in case of reads slightly beyond frame + gp.sched.lr = 0 + gp.sched.g = gp + gp.syscallpc = gp.sched.pc + gp.syscallsp = gp.sched.sp + // malg returns status as Gidle, change to Gsyscall before adding to allg + // where GC will see it. 
+ casgstatus(gp, _Gidle, _Gsyscall) + gp.m = mp + mp.curg = gp + mp.locked = _LockInternal + mp.lockedg = gp + gp.lockedm = mp + gp.goid = int64(xadd64(&sched.goidgen, 1)) + if raceenabled { + gp.racectx = racegostart(funcPC(newextram)) + } + // put on allg for garbage collector + allgadd(gp) + + // Add m to the extra list. + mnext := lockextra(true) + mp.schedlink = mnext + unlockextra(mp) + } + + // dropm is called when a cgo callback has called needm but is now + // done with the callback and returning back into the non-Go thread. + // It puts the current m back onto the extra list. + // + // The main expense here is the call to signalstack to release the + // m's signal stack, and then the call to needm on the next callback + // from this thread. It is tempting to try to save the m for next time, + // which would eliminate both these costs, but there might not be + // a next time: the current thread (which Go does not control) might exit. + // If we saved the m for that thread, there would be an m leak each time + // such a thread exited. Instead, we acquire and release an m on each + // call. These should typically not be scheduling operations, just a few + // atomics, so the cost should be small. + // + // TODO(rsc): An alternative would be to allocate a dummy pthread per-thread + // variable using pthread_key_create. Unlike the pthread keys we already use + // on OS X, this dummy key would never be read by Go code. It would exist + // only so that we could register at thread-exit-time destructor. + // That destructor would put the m back onto the extra list. + // This is purely a performance optimization. The current version, + // in which dropm happens on each cgo call, is still correct too. + // We may have to keep the current version on systems with cgo + // but without pthreads, like Windows. + func dropm() { + // Undo whatever initialization minit did during needm. + unminit() + + // Clear m and g, and return m to the extra list. + // After the call to setmg we can only call nosplit functions. + mp := getg().m + setg(nil) + + mnext := lockextra(true) + mp.schedlink = mnext + unlockextra(mp) + } + + var extram uintptr + + // lockextra locks the extra list and returns the list head. + // The caller must unlock the list by storing a new list head + // to extram. If nilokay is true, then lockextra will + // return a nil list head if that's what it finds. If nilokay is false, + // lockextra will keep waiting until the list head is no longer nil. + //go:nosplit + func lockextra(nilokay bool) *m { + const locked = 1 + + for { + old := atomicloaduintptr(&extram) + if old == locked { + yield := osyield + yield() + continue + } + if old == 0 && !nilokay { + usleep(1) + continue + } + if casuintptr(&extram, old, locked) { + return (*m)(unsafe.Pointer(old)) + } + yield := osyield + yield() + continue + } + } + + //go:nosplit + func unlockextra(mp *m) { + atomicstoreuintptr(&extram, uintptr(unsafe.Pointer(mp))) + } + + // Create a new m. It will start off with a call to fn, or else the scheduler. 
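An aside on lockextra/unlockextra above, not part of the patch: a list head word that doubles as a spin lock by parking the sentinel value 1 in it while the list is edited. All names below are invented. Note that ordinary Go code must keep real references to the nodes somewhere (the keep slice here); the runtime can hide m's behind a uintptr only because extra m's are never freed.

	package main

	import (
		"fmt"
		"runtime"
		"sync/atomic"
		"unsafe"
	)

	type node struct {
		next *node
		id   int
	}

	const locked uintptr = 1

	var (
		head uintptr // 0 = empty list, 1 = locked, otherwise a *node
		keep []*node // keeps the nodes reachable for the garbage collector
	)

	// lockList claims the head word and returns the list it found.
	func lockList() *node {
		for {
			old := atomic.LoadUintptr(&head)
			if old == locked {
				runtime.Gosched() // someone else is editing the list; try again
				continue
			}
			if atomic.CompareAndSwapUintptr(&head, old, locked) {
				return (*node)(unsafe.Pointer(old))
			}
		}
	}

	// unlockList publishes the new head, releasing the claim.
	func unlockList(n *node) {
		atomic.StoreUintptr(&head, uintptr(unsafe.Pointer(n)))
	}

	func main() {
		for i := 1; i <= 2; i++ { // push two nodes
			n := &node{id: i}
			keep = append(keep, n)
			n.next = lockList()
			unlockList(n)
		}
		top := lockList() // pop one
		unlockList(top.next)
		fmt.Println("popped node", top.id)
	}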
+ func _newm(fn func(), _p_ *p) { + mp := allocm(_p_) + mp.nextp = _p_ + mp.mstartfn = *(*unsafe.Pointer)(unsafe.Pointer(&fn)) + + if iscgo { + var ts cgothreadstart + if _cgo_thread_start == nil { + gothrow("_cgo_thread_start missing") + } + ts.g = mp.g0 + ts.tls = (*uint64)(unsafe.Pointer(&mp.tls[0])) + ts.fn = unsafe.Pointer(funcPC(mstart)) + asmcgocall(_cgo_thread_start, unsafe.Pointer(&ts)) + return + } + newosproc(mp, unsafe.Pointer(mp.g0.stack.hi)) + } + + // Stops execution of the current m until new work is available. + // Returns with acquired P. + func stopm() { + _g_ := getg() + + if _g_.m.locks != 0 { + gothrow("stopm holding locks") + } + if _g_.m.p != nil { + gothrow("stopm holding p") + } + if _g_.m.spinning { + _g_.m.spinning = false + xadd(&sched.nmspinning, -1) + } + + retry: + lock(&sched.lock) + mput(_g_.m) + unlock(&sched.lock) + notesleep(&_g_.m.park) + noteclear(&_g_.m.park) + if _g_.m.helpgc != 0 { + gchelper() + _g_.m.helpgc = 0 + _g_.m.mcache = nil + goto retry + } + acquirep(_g_.m.nextp) + _g_.m.nextp = nil + } + + func mspinning() { + getg().m.spinning = true + } + + // Schedules some M to run the p (creates an M if necessary). + // If p==nil, tries to get an idle P, if no idle P's does nothing. + func startm(_p_ *p, spinning bool) { + lock(&sched.lock) + if _p_ == nil { + _p_ = pidleget() + if _p_ == nil { + unlock(&sched.lock) + if spinning { + xadd(&sched.nmspinning, -1) + } + return + } + } + mp := mget() + unlock(&sched.lock) + if mp == nil { + var fn func() + if spinning { + fn = mspinning + } + _newm(fn, _p_) + return + } + if mp.spinning { + gothrow("startm: m is spinning") + } + if mp.nextp != nil { + gothrow("startm: m has p") + } + mp.spinning = spinning + mp.nextp = _p_ + notewakeup(&mp.park) + } + + // Hands off P from syscall or locked M. + func handoffp(_p_ *p) { + // if it has local work, start it straight away + if _p_.runqhead != _p_.runqtail || sched.runqsize != 0 { + startm(_p_, false) + return + } + // no local work, check that there are no spinning/idle M's, + // otherwise our help is not required + if atomicload(&sched.nmspinning)+atomicload(&sched.npidle) == 0 && cas(&sched.nmspinning, 0, 1) { // TODO: fast atomic + startm(_p_, true) + return + } + lock(&sched.lock) + if sched.gcwaiting != 0 { + _p_.status = _Pgcstop + sched.stopwait-- + if sched.stopwait == 0 { + notewakeup(&sched.stopnote) + } + unlock(&sched.lock) + return + } + if sched.runqsize != 0 { + unlock(&sched.lock) + startm(_p_, false) + return + } + // If this is the last running P and nobody is polling network, + // need to wakeup another M to poll network. + if sched.npidle == uint32(gomaxprocs-1) && atomicload64(&sched.lastpoll) != 0 { + unlock(&sched.lock) + startm(_p_, false) + return + } + pidleput(_p_) + unlock(&sched.lock) + } + + // Tries to add one more P to execute G's. + // Called when a G is made runnable (newproc, ready). + func wakep() { + // be conservative about spinning threads + if !cas(&sched.nmspinning, 0, 1) { + return + } + startm(nil, true) + } + + // Stops execution of the current m that is locked to a g until the g is runnable again. + // Returns with acquired P. + func stoplockedm() { + _g_ := getg() + + if _g_.m.lockedg == nil || _g_.m.lockedg.lockedm != _g_.m { + gothrow("stoplockedm: inconsistent locking") + } + if _g_.m.p != nil { + // Schedule another M to run this p. + _p_ := releasep() + handoffp(_p_) + } + incidlelocked(1) + // Wait until another thread schedules lockedg again. 
+ notesleep(&_g_.m.park) + noteclear(&_g_.m.park) + status := readgstatus(_g_.m.lockedg) + if status&^_Gscan != _Grunnable { + print("runtime:stoplockedm: g is not Grunnable or Gscanrunnable\n") + dumpgstatus(_g_) + gothrow("stoplockedm: not runnable") + } + acquirep(_g_.m.nextp) + _g_.m.nextp = nil + } + + // Schedules the locked m to run the locked gp. + func startlockedm(gp *g) { + _g_ := getg() + + mp := gp.lockedm + if mp == _g_.m { + gothrow("startlockedm: locked to me") + } + if mp.nextp != nil { + gothrow("startlockedm: m has p") + } + // directly handoff current P to the locked m + incidlelocked(-1) + _p_ := releasep() + mp.nextp = _p_ + notewakeup(&mp.park) + stopm() + } + + // Stops the current m for stoptheworld. + // Returns when the world is restarted. + func gcstopm() { + _g_ := getg() + + if sched.gcwaiting == 0 { + gothrow("gcstopm: not waiting for gc") + } + if _g_.m.spinning { + _g_.m.spinning = false + xadd(&sched.nmspinning, -1) + } + _p_ := releasep() + lock(&sched.lock) + _p_.status = _Pgcstop + sched.stopwait-- + if sched.stopwait == 0 { + notewakeup(&sched.stopnote) + } + unlock(&sched.lock) + stopm() + } + + // Schedules gp to run on the current M. + // Never returns. + func execute(gp *g) { + _g_ := getg() + + casgstatus(gp, _Grunnable, _Grunning) + gp.waitsince = 0 + gp.preempt = false + gp.stackguard0 = gp.stack.lo + _StackGuard + _g_.m.p.schedtick++ + _g_.m.curg = gp + gp.m = _g_.m + + // Check whether the profiler needs to be turned on or off. + hz := sched.profilehz + if _g_.m.profilehz != hz { + resetcpuprofiler(hz) + } + + gogo(&gp.sched) + } + + // Finds a runnable goroutine to execute. + // Tries to steal from other P's, get g from global queue, poll network. + func findrunnable() *g { + _g_ := getg() + + top: + if sched.gcwaiting != 0 { + gcstopm() + goto top + } + if fingwait && fingwake { + if gp := wakefing(); gp != nil { + ready(gp) + } + } + + // local runq + if gp := runqget(_g_.m.p); gp != nil { + return gp + } + + // global runq + if sched.runqsize != 0 { + lock(&sched.lock) + gp := globrunqget(_g_.m.p, 0) + unlock(&sched.lock) + if gp != nil { + return gp + } + } + + // poll network - returns list of goroutines + if gp := netpoll(false); gp != nil { // non-blocking + injectglist(gp.schedlink) + casgstatus(gp, _Gwaiting, _Grunnable) + return gp + } + + // If number of spinning M's >= number of busy P's, block. + // This is necessary to prevent excessive CPU consumption + // when GOMAXPROCS>>1 but the program parallelism is low. 
+ if !_g_.m.spinning && 2*atomicload(&sched.nmspinning) >= uint32(gomaxprocs)-atomicload(&sched.npidle) { // TODO: fast atomic + goto stop + } + if !_g_.m.spinning { + _g_.m.spinning = true + xadd(&sched.nmspinning, 1) + } + // random steal from other P's + for i := 0; i < int(2*gomaxprocs); i++ { + if sched.gcwaiting != 0 { + goto top + } + _p_ := allp[fastrand1()%uint32(gomaxprocs)] + var gp *g + if _p_ == _g_.m.p { + gp = runqget(_p_) + } else { + gp = runqsteal(_g_.m.p, _p_) + } + if gp != nil { + return gp + } + } + stop: + + // return P and block + lock(&sched.lock) + if sched.gcwaiting != 0 { + unlock(&sched.lock) + goto top + } + if sched.runqsize != 0 { + gp := globrunqget(_g_.m.p, 0) + unlock(&sched.lock) + return gp + } + _p_ := releasep() + pidleput(_p_) + unlock(&sched.lock) + if _g_.m.spinning { + _g_.m.spinning = false + xadd(&sched.nmspinning, -1) + } + + // check all runqueues once again + for i := 0; i < int(gomaxprocs); i++ { + _p_ := allp[i] + if _p_ != nil && _p_.runqhead != _p_.runqtail { + lock(&sched.lock) + _p_ = pidleget() + unlock(&sched.lock) + if _p_ != nil { + acquirep(_p_) + goto top + } + break + } + } + + // poll network + if xchg64(&sched.lastpoll, 0) != 0 { + if _g_.m.p != nil { + gothrow("findrunnable: netpoll with p") + } + if _g_.m.spinning { + gothrow("findrunnable: netpoll with spinning") + } + gp := netpoll(true) // block until new work is available + atomicstore64(&sched.lastpoll, uint64(nanotime())) + if gp != nil { + lock(&sched.lock) + _p_ = pidleget() + unlock(&sched.lock) + if _p_ != nil { + acquirep(_p_) + injectglist(gp.schedlink) + casgstatus(gp, _Gwaiting, _Grunnable) + return gp + } + injectglist(gp) + } + } + stopm() + goto top + } + + func resetspinning() { + _g_ := getg() + + var nmspinning uint32 + if _g_.m.spinning { + _g_.m.spinning = false + nmspinning = xadd(&sched.nmspinning, -1) + if nmspinning < 0 { + gothrow("findrunnable: negative nmspinning") + } + } else { + nmspinning = atomicload(&sched.nmspinning) + } + + // M wakeup policy is deliberately somewhat conservative (see nmspinning handling), + // so see if we need to wakeup another P here. + if nmspinning == 0 && atomicload(&sched.npidle) > 0 { + wakep() + } + } + + // Injects the list of runnable G's into the scheduler. + // Can run concurrently with GC. + func injectglist(glist *g) { + if glist == nil { + return + } + lock(&sched.lock) + var n int + for n = 0; glist != nil; n++ { + gp := glist + glist = gp.schedlink + casgstatus(gp, _Gwaiting, _Grunnable) + globrunqput(gp) + } + unlock(&sched.lock) + for ; n != 0 && sched.npidle != 0; n-- { + startm(nil, false) + } + } + + // One round of scheduler: find a runnable goroutine and execute it. + // Never returns. + func schedule() { + _g_ := getg() + + if _g_.m.locks != 0 { + gothrow("schedule: holding locks") + } + + if _g_.m.lockedg != nil { + stoplockedm() + execute(_g_.m.lockedg) // Never returns. + } + + top: + if sched.gcwaiting != 0 { + gcstopm() + goto top + } + + var gp *g + // Check the global runnable queue once in a while to ensure fairness. + // Otherwise two goroutines can completely occupy the local runqueue + // by constantly respawning each other. + tick := _g_.m.p.schedtick + // This is a fancy way to say tick%61==0, + // it uses 2 MUL instructions instead of a single DIV and so is faster on modern processors. 
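An aside, not part of the patch: checking that the multiply-and-shift in the line that follows really is "a fancy way to say tick%61==0". 0x4325c53f is ceil(2^36/61), so for any 32-bit tick, (tick*0x4325c53f)>>36 equals tick/61, and subtracting (tick/61)*61 leaves tick%61.

	package main

	import "fmt"

	func fancyMod61(tick uint32) uint64 {
		t := uint64(tick)
		return t - ((t*0x4325c53f)>>36)*61
	}

	func main() {
		check := func(tick uint32) {
			if fancyMod61(tick) != uint64(tick%61) {
				panic(fmt.Sprintf("mismatch at tick=%d", tick))
			}
		}
		for i := uint32(0); i < 1000000; i++ {
			check(i)              // small ticks
			check(^uint32(0) - i) // ticks near 2^32-1
		}
		for i := uint64(0); i < 1<<32; i += 12345 {
			check(uint32(i)) // coarse sweep over the whole range
		}
		fmt.Println("tick - (tick*0x4325c53f>>36)*61 == tick%61 on all sampled ticks")
	}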
+ if uint64(tick)-((uint64(tick)*0x4325c53f)>>36)*61 == 0 && sched.runqsize > 0 { + lock(&sched.lock) + gp = globrunqget(_g_.m.p, 1) + unlock(&sched.lock) + if gp != nil { + resetspinning() + } + } + if gp == nil { + gp = runqget(_g_.m.p) + if gp != nil && _g_.m.spinning { + gothrow("schedule: spinning with local work") + } + } + if gp == nil { + gp = findrunnable() // blocks until work is available + resetspinning() + } + + if gp.lockedm != nil { + // Hands off own p to the locked m, + // then blocks waiting for a new p. + startlockedm(gp) + goto top + } + + execute(gp) + } + + // dropg removes the association between m and the current goroutine m->curg (gp for short). + // Typically a caller sets gp's status away from Grunning and then + // immediately calls dropg to finish the job. The caller is also responsible + // for arranging that gp will be restarted using ready at an + // appropriate time. After calling dropg and arranging for gp to be + // readied later, the caller can do other work but eventually should + // call schedule to restart the scheduling of goroutines on this m. + func dropg() { + _g_ := getg() + + if _g_.m.lockedg == nil { + _g_.m.curg.m = nil + _g_.m.curg = nil + } + } + + // Puts the current goroutine into a waiting state and calls unlockf. + // If unlockf returns false, the goroutine is resumed. + func park(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason string) { + _g_ := getg() + + _g_.m.waitlock = lock + _g_.m.waitunlockf = *(*unsafe.Pointer)(unsafe.Pointer(&unlockf)) + _g_.waitreason = reason + mcall(park_m) + } + + func parkunlock_c(gp *g, lock unsafe.Pointer) bool { + unlock((*mutex)(lock)) + return true + } + + // Puts the current goroutine into a waiting state and unlocks the lock. + // The goroutine can be made runnable again by calling ready(gp). + func parkunlock(lock *mutex, reason string) { + park(parkunlock_c, unsafe.Pointer(lock), reason) + } + + // park continuation on g0. + func park_m(gp *g) { + _g_ := getg() + + casgstatus(gp, _Grunning, _Gwaiting) + dropg() + + if _g_.m.waitunlockf != nil { + fn := *(*func(*g, unsafe.Pointer) bool)(unsafe.Pointer(&_g_.m.waitunlockf)) + ok := fn(gp, _g_.m.waitlock) + _g_.m.waitunlockf = nil + _g_.m.waitlock = nil + if !ok { + casgstatus(gp, _Gwaiting, _Grunnable) + execute(gp) // Schedule it back, never returns. + } + } + schedule() + } + + // Gosched continuation on g0. + func gosched_m(gp *g) { + status := readgstatus(gp) + if status&^_Gscan != _Grunning { + dumpgstatus(gp) + gothrow("bad g status") + } + casgstatus(gp, _Grunning, _Grunnable) + dropg() + lock(&sched.lock) + globrunqput(gp) + unlock(&sched.lock) + + schedule() + } + + // Finishes execution of the current goroutine. + // Must be NOSPLIT because it is called from Go. (TODO - probably not anymore) + //go:nosplit + func goexit1() { + if raceenabled { + racegoend() + } + mcall(goexit0) + } + + // goexit continuation on g0. + func goexit0(gp *g) { + _g_ := getg() + + casgstatus(gp, _Grunning, _Gdead) + gp.m = nil + gp.lockedm = nil + _g_.m.lockedg = nil + gp.paniconfault = false + gp._defer = nil // should be true already but just in case. + gp._panic = nil // non-nil for Goexit during panic. points at stack-allocated data. 
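An aside on park and park_m above: the waitunlockf callback is stored in an untyped unsafe.Pointer field and reinterpreted back to its func type before the call. A minimal sketch of that round trip with toy names follows; it leans on the gc toolchain's func-value representation and is not portable, safe Go.

package main

import (
	"fmt"
	"unsafe"
)

type waiter struct {
	unlockf unsafe.Pointer // holds a func(int) bool, like m.waitunlockf holds func(*g, unsafe.Pointer) bool
}

func main() {
	f := func(x int) bool { return x > 0 }

	var w waiter
	// Store: reinterpret the func variable's word as an unsafe.Pointer.
	w.unlockf = *(*unsafe.Pointer)(unsafe.Pointer(&f))

	// Recover: reinterpret the stored word back as the original func type.
	got := *(*func(int) bool)(unsafe.Pointer(&w.unlockf))
	fmt.Println(got(3), got(-1)) // true false
}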
+ gp.writebuf = nil + gp.waitreason = "" + gp.param = nil + + dropg() + + if _g_.m.locked&^_LockExternal != 0 { + print("invalid m->locked = ", _g_.m.locked, "\n") + gothrow("internal lockOSThread error") + } + _g_.m.locked = 0 + gfput(_g_.m.p, gp) + schedule() + } + + //go:nosplit + func save(pc, sp uintptr) { + _g_ := getg() + + _g_.sched.pc = pc + _g_.sched.sp = sp + _g_.sched.lr = 0 + _g_.sched.ret = 0 + _g_.sched.ctxt = nil - _g_.sched.g = _g_ ++ // write as uintptr to avoid write barrier, which will smash _g_.sched. ++ *(*uintptr)(unsafe.Pointer(&_g_.sched.g)) = uintptr(unsafe.Pointer(_g_)) + } + + // The goroutine g is about to enter a system call. + // Record that it's not using the cpu anymore. + // This is called only from the go syscall library and cgocall, + // not from the low-level system calls used by the + // + // Entersyscall cannot split the stack: the gosave must + // make g->sched refer to the caller's stack segment, because + // entersyscall is going to return immediately after. + // + // Nothing entersyscall calls can split the stack either. + // We cannot safely move the stack during an active call to syscall, + // because we do not know which of the uintptr arguments are + // really pointers (back into the stack). + // In practice, this means that we make the fast path run through + // entersyscall doing no-split things, and the slow path has to use systemstack + // to run bigger things on the system stack. + // + // reentersyscall is the entry point used by cgo callbacks, where explicitly + // saved SP and PC are restored. This is needed when exitsyscall will be called + // from a function further up in the call stack than the parent, as g->syscallsp + // must always point to a valid stack frame. entersyscall below is the normal + // entry point for syscalls, which obtains the SP and PC from the caller. + //go:nosplit + func reentersyscall(pc, sp uintptr) { + _g_ := getg() + + // Disable preemption because during this function g is in Gsyscall status, + // but can have inconsistent g->sched, do not let GC observe it. + _g_.m.locks++ + + // Entersyscall must not call any function that might split/grow the stack. + // (See details in comment above.) + // Catch calls that might, by replacing the stack guard with something that + // will trip any stack check and leaving a flag to tell newstack to die. + _g_.stackguard0 = stackPreempt + _g_.throwsplit = true + + // Leave SP around for GC and traceback. + save(pc, sp) + _g_.syscallsp = sp + _g_.syscallpc = pc + casgstatus(_g_, _Grunning, _Gsyscall) + if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp { - systemstack(entersyscall_bad) ++ systemstack(func() { ++ print("entersyscall inconsistent ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n") ++ gothrow("entersyscall") ++ }) + } + + if atomicload(&sched.sysmonwait) != 0 { // TODO: fast atomic + systemstack(entersyscall_sysmon) + save(pc, sp) + } + + _g_.m.mcache = nil + _g_.m.p.m = nil + atomicstore(&_g_.m.p.status, _Psyscall) + if sched.gcwaiting != 0 { + systemstack(entersyscall_gcwait) + save(pc, sp) + } + + // Goroutines must not split stacks in Gsyscall status (it would corrupt g->sched). + // We set _StackGuard to StackPreempt so that first split stack check calls morestack. + // Morestack detects this case and throws. + _g_.stackguard0 = stackPreempt + _g_.m.locks-- + } + + // Standard syscall entry used by the go syscall library and normal cgo calls. 
+ //go:nosplit + func entersyscall(dummy int32) { + reentersyscall(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy))) + } + -func entersyscall_bad() { - var gp *g - gp = getg().m.curg - print("entersyscall inconsistent ", hex(gp.syscallsp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n") - gothrow("entersyscall") -} - + func entersyscall_sysmon() { + lock(&sched.lock) + if atomicload(&sched.sysmonwait) != 0 { + atomicstore(&sched.sysmonwait, 0) + notewakeup(&sched.sysmonnote) + } + unlock(&sched.lock) + } + + func entersyscall_gcwait() { + _g_ := getg() + + lock(&sched.lock) + if sched.stopwait > 0 && cas(&_g_.m.p.status, _Psyscall, _Pgcstop) { + if sched.stopwait--; sched.stopwait == 0 { + notewakeup(&sched.stopnote) + } + } + unlock(&sched.lock) + } + + // The same as entersyscall(), but with a hint that the syscall is blocking. + //go:nosplit + func entersyscallblock(dummy int32) { + _g_ := getg() + + _g_.m.locks++ // see comment in entersyscall + _g_.throwsplit = true + _g_.stackguard0 = stackPreempt // see comment in entersyscall + + // Leave SP around for GC and traceback. - save(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy))) ++ pc := getcallerpc(unsafe.Pointer(&dummy)) ++ sp := getcallersp(unsafe.Pointer(&dummy)) ++ save(pc, sp) + _g_.syscallsp = _g_.sched.sp + _g_.syscallpc = _g_.sched.pc ++ if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp { ++ sp1 := sp ++ sp2 := _g_.sched.sp ++ sp3 := _g_.syscallsp ++ systemstack(func() { ++ print("entersyscallblock inconsistent ", hex(sp1), " ", hex(sp2), " ", hex(sp3), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n") ++ gothrow("entersyscallblock") ++ }) ++ } + casgstatus(_g_, _Grunning, _Gsyscall) + if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp { - systemstack(entersyscall_bad) ++ systemstack(func() { ++ print("entersyscallblock inconsistent ", hex(sp), " ", hex(_g_.sched.sp), " ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n") ++ gothrow("entersyscallblock") ++ }) + } + + systemstack(entersyscallblock_handoff) + + // Resave for traceback during blocked call. + save(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy))) + + _g_.m.locks-- + } + + func entersyscallblock_handoff() { + handoffp(releasep()) + } + + // The goroutine g exited its system call. + // Arrange for it to run on a cpu again. + // This is called only from the go syscall library, not + // from the low-level system calls used by the + //go:nosplit + func exitsyscall(dummy int32) { + _g_ := getg() + + _g_.m.locks++ // see comment in entersyscall + if getcallersp(unsafe.Pointer(&dummy)) > _g_.syscallsp { + gothrow("exitsyscall: syscall frame is no longer valid") + } + + _g_.waitsince = 0 + if exitsyscallfast() { + if _g_.m.mcache == nil { + gothrow("lost mcache") + } + // There's a cpu for us, so we can run. + _g_.m.p.syscalltick++ + // We need to cas the status and scan before resuming... + casgstatus(_g_, _Gsyscall, _Grunning) + + // Garbage collector isn't running (since we are), + // so okay to clear syscallsp. + _g_.syscallsp = 0 + _g_.m.locks-- + if _g_.preempt { + // restore the preemption request in case we've cleared it in newstack + _g_.stackguard0 = stackPreempt + } else { + // otherwise restore the real _StackGuard, we've spoiled it in entersyscall/entersyscallblock + _g_.stackguard0 = _g_.stack.lo + _StackGuard + } + _g_.throwsplit = false + return + } + + _g_.m.locks-- + + // Call the scheduler. 
+ mcall(exitsyscall0) + + if _g_.m.mcache == nil { + gothrow("lost mcache") + } + + // Scheduler returned, so we're allowed to run now. + // Delete the syscallsp information that we left for + // the garbage collector during the system call. + // Must wait until now because until gosched returns + // we don't know for sure that the garbage collector + // is not running. + _g_.syscallsp = 0 + _g_.m.p.syscalltick++ + _g_.throwsplit = false + } + + //go:nosplit + func exitsyscallfast() bool { + _g_ := getg() + + // Freezetheworld sets stopwait but does not retake P's. + if sched.stopwait != 0 { ++ _g_.m.mcache = nil + _g_.m.p = nil + return false + } + + // Try to re-acquire the last P. + if _g_.m.p != nil && _g_.m.p.status == _Psyscall && cas(&_g_.m.p.status, _Psyscall, _Prunning) { + // There's a cpu for us, so we can run. + _g_.m.mcache = _g_.m.p.mcache + _g_.m.p.m = _g_.m + return true + } + + // Try to get any other idle P. ++ _g_.m.mcache = nil + _g_.m.p = nil + if sched.pidle != nil { + var ok bool + systemstack(func() { + ok = exitsyscallfast_pidle() + }) + if ok { + return true + } + } + return false + } + + func exitsyscallfast_pidle() bool { + lock(&sched.lock) + _p_ := pidleget() + if _p_ != nil && atomicload(&sched.sysmonwait) != 0 { + atomicstore(&sched.sysmonwait, 0) + notewakeup(&sched.sysmonnote) + } + unlock(&sched.lock) + if _p_ != nil { + acquirep(_p_) + return true + } + return false + } + + // exitsyscall slow path on g0. + // Failed to acquire P, enqueue gp as runnable. + func exitsyscall0(gp *g) { + _g_ := getg() + + casgstatus(gp, _Gsyscall, _Grunnable) + dropg() + lock(&sched.lock) + _p_ := pidleget() + if _p_ == nil { + globrunqput(gp) + } else if atomicload(&sched.sysmonwait) != 0 { + atomicstore(&sched.sysmonwait, 0) + notewakeup(&sched.sysmonnote) + } + unlock(&sched.lock) + if _p_ != nil { + acquirep(_p_) + execute(gp) // Never returns. + } + if _g_.m.lockedg != nil { + // Wait until another thread schedules gp and so m again. + stoplockedm() + execute(gp) // Never returns. + } + stopm() + schedule() // Never returns. + } + + func beforefork() { + gp := getg().m.curg + + // Fork can hang if preempted with signals frequently enough (see issue 5517). + // Ensure that we stay on the same M where we disable profiling. + gp.m.locks++ + if gp.m.profilehz != 0 { + resetcpuprofiler(0) + } + + // This function is called before fork in syscall package. + // Code between fork and exec must not allocate memory nor even try to grow stack. + // Here we spoil g->_StackGuard to reliably detect any attempts to grow stack. + // runtime_AfterFork will undo this in parent process, but not in child. + gp.stackguard0 = stackFork + } + + // Called from syscall package before fork. + //go:nosplit + func syscall_BeforeFork() { + systemstack(beforefork) + } + + func afterfork() { + gp := getg().m.curg + + // See the comment in beforefork. + gp.stackguard0 = gp.stack.lo + _StackGuard + + hz := sched.profilehz + if hz != 0 { + resetcpuprofiler(hz) + } + gp.m.locks-- + } + + // Called from syscall package after fork in parent. + //go:nosplit + func syscall_AfterFork() { + systemstack(afterfork) + } + + // Allocate a new g, with a stack big enough for stacksize bytes. 
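Back in exitsyscallfast above, the whole fast path hinges on one compare-and-swap of the P's status. A toy sketch of that race, with hypothetical types and constants rather than the runtime's:

package main

import (
	"fmt"
	"sync/atomic"
)

const (
	pIdle uint32 = iota
	pRunning
	pSyscall
)

type proc struct{ status uint32 }

// reacquire succeeds only if the P is still parked in the syscall state;
// if sysmon already retook it, the CAS fails and the caller must find
// another P or queue itself globally.
func reacquire(p *proc) bool {
	return atomic.CompareAndSwapUint32(&p.status, pSyscall, pRunning)
}

func main() {
	p := &proc{status: pSyscall}
	fmt.Println(reacquire(p)) // true: we won the race
	fmt.Println(reacquire(p)) // false: the status is no longer pSyscall
}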
+ func malg(stacksize int32) *g { + newg := allocg() + if stacksize >= 0 { + stacksize = round2(_StackSystem + stacksize) + systemstack(func() { + newg.stack = stackalloc(uint32(stacksize)) + }) + newg.stackguard0 = newg.stack.lo + _StackGuard + newg.stackguard1 = ^uintptr(0) + } + return newg + } + + // Create a new g running fn with siz bytes of arguments. + // Put it on the queue of g's waiting to run. + // The compiler turns a go statement into a call to this. + // Cannot split the stack because it assumes that the arguments + // are available sequentially after &fn; they would not be + // copied if a stack split occurred. + //go:nosplit + func newproc(siz int32, fn *funcval) { + argp := add(unsafe.Pointer(&fn), ptrSize) + if hasLinkRegister { + argp = add(argp, ptrSize) // skip caller's saved LR + } + + pc := getcallerpc(unsafe.Pointer(&siz)) + systemstack(func() { + newproc1(fn, (*uint8)(argp), siz, 0, pc) + }) + } + + // Create a new g running fn with narg bytes of arguments starting + // at argp and returning nret bytes of results. callerpc is the + // address of the go statement that created this. The new g is put + // on the queue of g's waiting to run. + func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr) *g { + _g_ := getg() + + if fn == nil { + _g_.m.throwing = -1 // do not dump full stacks + gothrow("go of nil func value") + } + _g_.m.locks++ // disable preemption because it can be holding p in a local var + siz := narg + nret + siz = (siz + 7) &^ 7 + + // We could allocate a larger initial stack if necessary. + // Not worth it: this is almost always an error. + // 4*sizeof(uintreg): extra space added below + // sizeof(uintreg): caller's LR (arm) or return address (x86, in gostartcall). + if siz >= _StackMin-4*regSize-regSize { + gothrow("newproc: function arguments too large for new goroutine") + } + + _p_ := _g_.m.p + newg := gfget(_p_) + if newg == nil { + newg = malg(_StackMin) + casgstatus(newg, _Gidle, _Gdead) + allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack. + } + if newg.stack.hi == 0 { + gothrow("newproc1: newg missing stack") + } + + if readgstatus(newg) != _Gdead { + gothrow("newproc1: new g is not Gdead") + } + + sp := newg.stack.hi + sp -= 4 * regSize // extra space in case of reads slightly beyond frame + sp -= uintptr(siz) + memmove(unsafe.Pointer(sp), unsafe.Pointer(argp), uintptr(narg)) + if hasLinkRegister { + // caller's LR + sp -= ptrSize + *(*unsafe.Pointer)(unsafe.Pointer(sp)) = nil + } + + memclr(unsafe.Pointer(&newg.sched), unsafe.Sizeof(newg.sched)) + newg.sched.sp = sp + newg.sched.pc = funcPC(goexit) + _PCQuantum // +PCQuantum so that previous instruction is in same function + newg.sched.g = newg + gostartcallfn(&newg.sched, fn) + newg.gopc = callerpc + casgstatus(newg, _Gdead, _Grunnable) + + if _p_.goidcache == _p_.goidcacheend { + // Sched.goidgen is the last allocated id, + // this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch]. + // At startup sched.goidgen=0, so main goroutine receives goid=1. 
+ _p_.goidcache = xadd64(&sched.goidgen, _GoidCacheBatch) + _p_.goidcache -= _GoidCacheBatch - 1 + _p_.goidcacheend = _p_.goidcache + _GoidCacheBatch + } + newg.goid = int64(_p_.goidcache) + _p_.goidcache++ + if raceenabled { + newg.racectx = racegostart(callerpc) + } + runqput(_p_, newg) + + if atomicload(&sched.npidle) != 0 && atomicload(&sched.nmspinning) == 0 && unsafe.Pointer(fn.fn) != unsafe.Pointer(funcPC(main)) { // TODO: fast atomic + wakep() + } + _g_.m.locks-- + if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack + _g_.stackguard0 = stackPreempt + } + return newg + } + + // Put on gfree list. + // If local list is too long, transfer a batch to the global list. + func gfput(_p_ *p, gp *g) { + if readgstatus(gp) != _Gdead { + gothrow("gfput: bad status (not Gdead)") + } + + stksize := gp.stack.hi - gp.stack.lo + + if stksize != _FixedStack { + // non-standard stack size - free it. + stackfree(gp.stack) + gp.stack.lo = 0 + gp.stack.hi = 0 + gp.stackguard0 = 0 + } + + gp.schedlink = _p_.gfree + _p_.gfree = gp + _p_.gfreecnt++ + if _p_.gfreecnt >= 64 { + lock(&sched.gflock) + for _p_.gfreecnt >= 32 { + _p_.gfreecnt-- + gp = _p_.gfree + _p_.gfree = gp.schedlink + gp.schedlink = sched.gfree + sched.gfree = gp + sched.ngfree++ + } + unlock(&sched.gflock) + } + } + + // Get from gfree list. + // If local list is empty, grab a batch from global list. + func gfget(_p_ *p) *g { + retry: + gp := _p_.gfree + if gp == nil && sched.gfree != nil { + lock(&sched.gflock) + for _p_.gfreecnt < 32 && sched.gfree != nil { + _p_.gfreecnt++ + gp = sched.gfree + sched.gfree = gp.schedlink + sched.ngfree-- + gp.schedlink = _p_.gfree + _p_.gfree = gp + } + unlock(&sched.gflock) + goto retry + } + if gp != nil { + _p_.gfree = gp.schedlink + _p_.gfreecnt-- + if gp.stack.lo == 0 { + // Stack was deallocated in gfput. Allocate a new one. + systemstack(func() { + gp.stack = stackalloc(_FixedStack) + }) + gp.stackguard0 = gp.stack.lo + _StackGuard + } else { + if raceenabled { + racemalloc(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo) + } + } + } + return gp + } + + // Purge all cached G's from gfree list to the global list. + func gfpurge(_p_ *p) { + lock(&sched.gflock) + for _p_.gfreecnt != 0 { + _p_.gfreecnt-- + gp := _p_.gfree + _p_.gfree = gp.schedlink + gp.schedlink = sched.gfree + sched.gfree = gp + sched.ngfree++ + } + unlock(&sched.gflock) + } + + // Breakpoint executes a breakpoint trap. + func Breakpoint() { + breakpoint() + } + + // dolockOSThread is called by LockOSThread and lockOSThread below + // after they modify m.locked. Do not allow preemption during this call, + // or else the m might be different in this function than in the caller. + //go:nosplit + func dolockOSThread() { + _g_ := getg() + _g_.m.lockedg = _g_ + _g_.lockedm = _g_.m + } + + //go:nosplit + + // LockOSThread wires the calling goroutine to its current operating system thread. + // Until the calling goroutine exits or calls UnlockOSThread, it will always + // execute in that thread, and no other goroutine can. + func LockOSThread() { + getg().m.locked |= _LockExternal + dolockOSThread() + } + + //go:nosplit + func lockOSThread() { + getg().m.locked += _LockInternal + dolockOSThread() + } + + // dounlockOSThread is called by UnlockOSThread and unlockOSThread below + // after they update m->locked. Do not allow preemption during this call, + // or else the m might be in different in this function than in the caller. 
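The goid cache refill above amortizes the shared counter: one atomic add reserves a whole batch of IDs that the P then hands out with no further synchronization. A standalone sketch with made-up names and a batch size of 16:

package main

import (
	"fmt"
	"sync/atomic"
)

const batch = 16

type idAllocator struct {
	gen uint64 // shared; last allocated id
}

type pcache struct {
	next, end uint64 // per-P, unsynchronized
}

func (c *pcache) nextID(a *idAllocator) uint64 {
	if c.next == c.end {
		// Reserve (gen, gen+batch] with a single atomic add;
		// the add returns the new high end of the batch.
		hi := atomic.AddUint64(&a.gen, batch)
		c.next = hi - batch + 1
		c.end = hi + 1
	}
	id := c.next
	c.next++
	return id
}

func main() {
	var a idAllocator
	var c pcache
	fmt.Println(c.nextID(&a), c.nextID(&a)) // 1 2, matching "main goroutine receives goid=1"
}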
+ //go:nosplit + func dounlockOSThread() { + _g_ := getg() + if _g_.m.locked != 0 { + return + } + _g_.m.lockedg = nil + _g_.lockedm = nil + } + + //go:nosplit + + // UnlockOSThread unwires the calling goroutine from its fixed operating system thread. + // If the calling goroutine has not called LockOSThread, UnlockOSThread is a no-op. + func UnlockOSThread() { + getg().m.locked &^= _LockExternal + dounlockOSThread() + } + + //go:nosplit + func unlockOSThread() { + _g_ := getg() + if _g_.m.locked < _LockInternal { + systemstack(badunlockosthread) + } + _g_.m.locked -= _LockInternal + dounlockOSThread() + } + + func badunlockosthread() { + gothrow("runtime: internal error: misuse of lockOSThread/unlockOSThread") + } + + func gcount() int32 { + n := int32(allglen) - sched.ngfree + for i := 0; ; i++ { + _p_ := allp[i] + if _p_ == nil { + break + } + n -= _p_.gfreecnt + } + + // All these variables can be changed concurrently, so the result can be inconsistent. + // But at least the current goroutine is running. + if n < 1 { + n = 1 + } + return n + } + + func mcount() int32 { + return sched.mcount + } + + var prof struct { + lock uint32 + hz int32 + } + + func _System() { _System() } + func _ExternalCode() { _ExternalCode() } + func _GC() { _GC() } + + var etext struct{} + + // Called if we receive a SIGPROF signal. + func sigprof(pc *uint8, sp *uint8, lr *uint8, gp *g, mp *m) { + var n int32 + var traceback bool + var stk [100]uintptr + + if prof.hz == 0 { + return + } + + // Profiling runs concurrently with GC, so it must not allocate. + mp.mallocing++ + + // Define that a "user g" is a user-created goroutine, and a "system g" + // is one that is m->g0 or m->gsignal. We've only made sure that we + // can unwind user g's, so exclude the system g's. + // + // It is not quite as easy as testing gp == m->curg (the current user g) + // because we might be interrupted for profiling halfway through a + // goroutine switch. The switch involves updating three (or four) values: + // g, PC, SP, and (on arm) LR. The PC must be the last to be updated, + // because once it gets updated the new g is running. + // + // When switching from a user g to a system g, LR is not considered live, + // so the update only affects g, SP, and PC. Since PC must be last, there + // the possible partial transitions in ordinary execution are (1) g alone is updated, + // (2) both g and SP are updated, and (3) SP alone is updated. + // If g is updated, we'll see a system g and not look closer. + // If SP alone is updated, we can detect the partial transition by checking + // whether the SP is within g's stack bounds. (We could also require that SP + // be changed only after g, but the stack bounds check is needed by other + // cases, so there is no need to impose an additional requirement.) + // + // There is one exceptional transition to a system g, not in ordinary execution. + // When a signal arrives, the operating system starts the signal handler running + // with an updated PC and SP. The g is updated last, at the beginning of the + // handler. There are two reasons this is okay. First, until g is updated the + // g and SP do not match, so the stack bounds check detects the partial transition. + // Second, signal handlers currently run with signals disabled, so a profiling + // signal cannot arrive during the handler. + // + // When switching from a system g to a user g, there are three possibilities. 
+ // + // First, it may be that the g switch has no PC update, because the SP + // either corresponds to a user g throughout (as in asmcgocall) + // or because it has been arranged to look like a user g frame + // (as in cgocallback_gofunc). In this case, since the entire + // transition is a g+SP update, a partial transition updating just one of + // those will be detected by the stack bounds check. + // + // Second, when returning from a signal handler, the PC and SP updates + // are performed by the operating system in an atomic update, so the g + // update must be done before them. The stack bounds check detects + // the partial transition here, and (again) signal handlers run with signals + // disabled, so a profiling signal cannot arrive then anyway. + // + // Third, the common case: it may be that the switch updates g, SP, and PC + // separately, as in gogo. + // + // Because gogo is the only instance, we check whether the PC lies + // within that function, and if so, not ask for a traceback. This approach + // requires knowing the size of the gogo function, which we + // record in arch_*.h and check in runtime_test.go. + // + // There is another apparently viable approach, recorded here in case + // the "PC within gogo" check turns out not to be usable. + // It would be possible to delay the update of either g or SP until immediately + // before the PC update instruction. Then, because of the stack bounds check, + // the only problematic interrupt point is just before that PC update instruction, + // and the sigprof handler can detect that instruction and simulate stepping past + // it in order to reach a consistent state. On ARM, the update of g must be made + // in two places (in R10 and also in a TLS slot), so the delayed update would + // need to be the SP update. The sigprof handler must read the instruction at + // the current PC and if it was the known instruction (for example, JMP BX or + // MOV R2, PC), use that other register in place of the PC value. + // The biggest drawback to this solution is that it requires that we can tell + // whether it's safe to read from the memory pointed at by PC. + // In a correct program, we can test PC == nil and otherwise read, + // but if a profiling signal happens at the instant that a program executes + // a bad jump (before the program manages to handle the resulting fault) + // the profiling handler could fault trying to read nonexistent memory. + // + // To recap, there are no constraints on the assembly being used for the + // transition. We simply require that g and SP match and that the PC is not + // in gogo. + traceback = true + usp := uintptr(unsafe.Pointer(sp)) + gogo := funcPC(gogo) + if gp == nil || gp != mp.curg || + usp < gp.stack.lo || gp.stack.hi < usp || + (gogo <= uintptr(unsafe.Pointer(pc)) && uintptr(unsafe.Pointer(pc)) < gogo+_RuntimeGogoBytes) { + traceback = false + } + + n = 0 + if traceback { + n = int32(gentraceback(uintptr(unsafe.Pointer(pc)), uintptr(unsafe.Pointer(sp)), uintptr(unsafe.Pointer(lr)), gp, 0, &stk[0], len(stk), nil, nil, _TraceTrap)) + } + if !traceback || n <= 0 { + // Normal traceback is impossible or has failed. + // See if it falls into several common cases. + n = 0 + if mp.ncgo > 0 && mp.curg != nil && mp.curg.syscallpc != 0 && mp.curg.syscallsp != 0 { + // Cgo, we can't unwind and symbolize arbitrary C code, + // so instead collect Go stack that leads to the cgo call. + // This is especially important on windows, since all syscalls are cgo calls. 
+ n = int32(gentraceback(mp.curg.syscallpc, mp.curg.syscallsp, 0, mp.curg, 0, &stk[0], len(stk), nil, nil, 0)) + } + if GOOS == "windows" && n == 0 && mp.libcallg != nil && mp.libcallpc != 0 && mp.libcallsp != 0 { + // Libcall, i.e. runtime syscall on windows. + // Collect Go stack that leads to the call. + n = int32(gentraceback(mp.libcallpc, mp.libcallsp, 0, mp.libcallg, 0, &stk[0], len(stk), nil, nil, 0)) + } + if n == 0 { + // If all of the above has failed, account it against abstract "System" or "GC". + n = 2 + // "ExternalCode" is better than "etext". + if uintptr(unsafe.Pointer(pc)) > uintptr(unsafe.Pointer(&etext)) { + pc = (*uint8)(unsafe.Pointer(uintptr(funcPC(_ExternalCode) + _PCQuantum))) + } + stk[0] = uintptr(unsafe.Pointer(pc)) + if mp.gcing != 0 || mp.helpgc != 0 { + stk[1] = funcPC(_GC) + _PCQuantum + } else { + stk[1] = funcPC(_System) + _PCQuantum + } + } + } + + if prof.hz != 0 { + // Simple cas-lock to coordinate with setcpuprofilerate. + for !cas(&prof.lock, 0, 1) { + osyield() + } + if prof.hz != 0 { + cpuproftick(&stk[0], n) + } + atomicstore(&prof.lock, 0) + } + mp.mallocing-- + } + + // Arrange to call fn with a traceback hz times a second. + func setcpuprofilerate_m(hz int32) { + // Force sane arguments. + if hz < 0 { + hz = 0 + } + + // Disable preemption, otherwise we can be rescheduled to another thread + // that has profiling enabled. + _g_ := getg() + _g_.m.locks++ + + // Stop profiler on this thread so that it is safe to lock prof. + // if a profiling signal came in while we had prof locked, + // it would deadlock. + resetcpuprofiler(0) + + for !cas(&prof.lock, 0, 1) { + osyield() + } + prof.hz = hz + atomicstore(&prof.lock, 0) + + lock(&sched.lock) + sched.profilehz = hz + unlock(&sched.lock) + + if hz != 0 { + resetcpuprofiler(hz) + } + + _g_.m.locks-- + } + + // Change number of processors. The world is stopped, sched is locked. ++// gcworkbufs are not being modified by either the GC or ++// the write barrier code. + func procresize(new int32) { + old := gomaxprocs + if old < 0 || old > _MaxGomaxprocs || new <= 0 || new > _MaxGomaxprocs { + gothrow("procresize: invalid arg") + } + + // initialize new P's + for i := int32(0); i < new; i++ { + p := allp[i] + if p == nil { + p = newP() + p.id = i + p.status = _Pgcstop + atomicstorep(unsafe.Pointer(&allp[i]), unsafe.Pointer(p)) + } + if p.mcache == nil { + if old == 0 && i == 0 { + if getg().m.mcache == nil { + gothrow("missing mcache?") + } + p.mcache = getg().m.mcache // bootstrap + } else { + p.mcache = allocmcache() + } + } + } + + // redistribute runnable G's evenly + // collect all runnable goroutines in global queue preserving FIFO order + // FIFO order is required to ensure fairness even during frequent GCs + // see http://golang.org/issue/7126 + empty := false + for !empty { + empty = true + for i := int32(0); i < old; i++ { + p := allp[i] + if p.runqhead == p.runqtail { + continue + } + empty = false + // pop from tail of local queue + p.runqtail-- + gp := p.runq[p.runqtail%uint32(len(p.runq))] + // push onto head of global queue + gp.schedlink = sched.runqhead + sched.runqhead = gp + if sched.runqtail == nil { + sched.runqtail = gp + } + sched.runqsize++ + } + } + + // fill local queues with at most len(p.runq)/2 goroutines + // start at 1 because current M already executes some G and will acquire allp[0] below, + // so if we have a spare G we want to put it into allp[1]. 
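The prof.lock dance above (in both sigprof and setcpuprofilerate_m) is a bare compare-and-swap lock that yields between attempts. A standalone sketch using only the standard library, with runtime.Gosched standing in for the runtime-internal osyield:

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
)

type caslock struct{ state uint32 }

func (l *caslock) lock() {
	for !atomic.CompareAndSwapUint32(&l.state, 0, 1) {
		runtime.Gosched() // the runtime calls osyield() here
	}
}

func (l *caslock) unlock() { atomic.StoreUint32(&l.state, 0) }

func main() {
	var l caslock
	l.lock()
	fmt.Println("holding the profiler lock")
	l.unlock()
}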
+ var _p_ p + for i := int32(1); i < new*int32(len(_p_.runq))/2 && sched.runqsize > 0; i++ { + gp := sched.runqhead + sched.runqhead = gp.schedlink + if sched.runqhead == nil { + sched.runqtail = nil + } + sched.runqsize-- + runqput(allp[i%new], gp) + } + + // free unused P's + for i := new; i < old; i++ { + p := allp[i] + freemcache(p.mcache) + p.mcache = nil + gfpurge(p) + p.status = _Pdead + // can't free P itself because it can be referenced by an M in syscall + } + + _g_ := getg() + if _g_.m.p != nil { + _g_.m.p.m = nil + } + _g_.m.p = nil + _g_.m.mcache = nil + p := allp[0] + p.m = nil + p.status = _Pidle + acquirep(p) + for i := new - 1; i > 0; i-- { + p := allp[i] + p.status = _Pidle + pidleput(p) + } + var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32 + atomicstore((*uint32)(unsafe.Pointer(int32p)), uint32(new)) + } + + // Associate p and the current m. + func acquirep(_p_ *p) { + _g_ := getg() + + if _g_.m.p != nil || _g_.m.mcache != nil { + gothrow("acquirep: already in go") + } + if _p_.m != nil || _p_.status != _Pidle { + id := int32(0) + if _p_.m != nil { + id = _p_.m.id + } + print("acquirep: p->m=", _p_.m, "(", id, ") p->status=", _p_.status, "\n") + gothrow("acquirep: invalid p state") + } + _g_.m.mcache = _p_.mcache + _g_.m.p = _p_ + _p_.m = _g_.m + _p_.status = _Prunning + } + + // Disassociate p and the current m. + func releasep() *p { + _g_ := getg() + + if _g_.m.p == nil || _g_.m.mcache == nil { + gothrow("releasep: invalid arg") + } + _p_ := _g_.m.p + if _p_.m != _g_.m || _p_.mcache != _g_.m.mcache || _p_.status != _Prunning { + print("releasep: m=", _g_.m, " m->p=", _g_.m.p, " p->m=", _p_.m, " m->mcache=", _g_.m.mcache, " p->mcache=", _p_.mcache, " p->status=", _p_.status, "\n") + gothrow("releasep: invalid p state") + } + _g_.m.p = nil + _g_.m.mcache = nil + _p_.m = nil + _p_.status = _Pidle + return _p_ + } + + func incidlelocked(v int32) { + lock(&sched.lock) + sched.nmidlelocked += v + if v > 0 { + checkdead() + } + unlock(&sched.lock) + } + + // Check for deadlock situation. + // The check is based on number of running M's, if 0 -> deadlock. + func checkdead() { + // If we are dying because of a signal caught on an already idle thread, + // freezetheworld will cause all running threads to block. + // And runtime will essentially enter into deadlock state, + // except that there is a thread that will call exit soon. + if panicking > 0 { + return + } + + // -1 for sysmon + run := sched.mcount - sched.nmidle - sched.nmidlelocked - 1 + if run > 0 { + return + } + if run < 0 { + print("runtime: checkdead: nmidle=", sched.nmidle, " nmidlelocked=", sched.nmidlelocked, " mcount=", sched.mcount, "\n") + gothrow("checkdead: inconsistent counts") + } + + grunning := 0 + lock(&allglock) + for i := 0; i < len(allgs); i++ { + gp := allgs[i] + if gp.issystem { + continue + } + s := readgstatus(gp) + switch s &^ _Gscan { + case _Gwaiting: + grunning++ + case _Grunnable, + _Grunning, + _Gsyscall: + unlock(&allglock) + print("runtime: checkdead: find g ", gp.goid, " in status ", s, "\n") + gothrow("checkdead: runnable g") + } + } + unlock(&allglock) + if grunning == 0 { // possible if main goroutine calls runtime·Goexit() + gothrow("no goroutines (main called runtime.Goexit) - deadlock!") + } + + // Maybe jump time forward for playground. 
+ gp := timejump() + if gp != nil { + casgstatus(gp, _Gwaiting, _Grunnable) + globrunqput(gp) + _p_ := pidleget() + if _p_ == nil { + gothrow("checkdead: no p for timer") + } + mp := mget() + if mp == nil { + _newm(nil, _p_) + } else { + mp.nextp = _p_ + notewakeup(&mp.park) + } + return + } + + getg().m.throwing = -1 // do not dump full stacks + gothrow("all goroutines are asleep - deadlock!") + } + + func sysmon() { + // If we go two minutes without a garbage collection, force one to run. + forcegcperiod := int64(2 * 60 * 1e9) + + // If a heap span goes unused for 5 minutes after a garbage collection, + // we hand it back to the operating system. + scavengelimit := int64(5 * 60 * 1e9) + + if debug.scavenge > 0 { + // Scavenge-a-lot for testing. + forcegcperiod = 10 * 1e6 + scavengelimit = 20 * 1e6 + } + + lastscavenge := nanotime() + nscavenge := 0 + + // Make wake-up period small enough for the sampling to be correct. + maxsleep := forcegcperiod / 2 + if scavengelimit < forcegcperiod { + maxsleep = scavengelimit / 2 + } + + lasttrace := int64(0) + idle := 0 // how many cycles in succession we had not wokeup somebody + delay := uint32(0) + for { + if idle == 0 { // start with 20us sleep... + delay = 20 + } else if idle > 50 { // start doubling the sleep after 1ms... + delay *= 2 + } + if delay > 10*1000 { // up to 10ms + delay = 10 * 1000 + } + usleep(delay) + if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomicload(&sched.npidle) == uint32(gomaxprocs)) { // TODO: fast atomic + lock(&sched.lock) + if atomicload(&sched.gcwaiting) != 0 || atomicload(&sched.npidle) == uint32(gomaxprocs) { + atomicstore(&sched.sysmonwait, 1) + unlock(&sched.lock) + notetsleep(&sched.sysmonnote, maxsleep) + lock(&sched.lock) + atomicstore(&sched.sysmonwait, 0) + noteclear(&sched.sysmonnote) + idle = 0 + delay = 20 + } + unlock(&sched.lock) + } + // poll network if not polled for more than 10ms + lastpoll := int64(atomicload64(&sched.lastpoll)) + now := nanotime() + unixnow := unixnanotime() + if lastpoll != 0 && lastpoll+10*1000*1000 < now { + cas64(&sched.lastpoll, uint64(lastpoll), uint64(now)) + gp := netpoll(false) // non-blocking - returns list of goroutines + if gp != nil { + // Need to decrement number of idle locked M's + // (pretending that one more is running) before injectglist. + // Otherwise it can lead to the following situation: + // injectglist grabs all P's but before it starts M's to run the P's, + // another M returns from syscall, finishes running its G, + // observes that there is no work to do and no other running M's + // and reports deadlock. 
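sysmon's polling loop above adapts its sleep: 20µs while it keeps finding work, doubling once it has been idle for more than 50 consecutive cycles, and never more than 10ms. A standalone sketch of that backoff:

package main

import "fmt"

// nextDelay returns the next sleep in microseconds, given the previous delay
// and how many cycles in a row sysmon found nothing to do.
func nextDelay(delay uint32, idle int) uint32 {
	if idle == 0 {
		delay = 20
	} else if idle > 50 {
		delay *= 2
	}
	if delay > 10*1000 {
		delay = 10 * 1000
	}
	return delay
}

func main() {
	d := uint32(0)
	for idle := 0; idle <= 60; idle++ {
		d = nextDelay(d, idle)
	}
	fmt.Println(d, "µs") // 10000 µs: capped after roughly ten doublings
}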
+ incidlelocked(-1) + injectglist(gp) + incidlelocked(1) + } + } + // retake P's blocked in syscalls + // and preempt long running G's + if retake(now) != 0 { + idle = 0 + } else { + idle++ + } + // check if we need to force a GC + lastgc := int64(atomicload64(&memstats.last_gc)) + if lastgc != 0 && unixnow-lastgc > forcegcperiod && atomicload(&forcegc.idle) != 0 { + lock(&forcegc.lock) + forcegc.idle = 0 + forcegc.g.schedlink = nil + injectglist(forcegc.g) + unlock(&forcegc.lock) + } + // scavenge heap once in a while + if lastscavenge+scavengelimit/2 < now { + mHeap_Scavenge(int32(nscavenge), uint64(now), uint64(scavengelimit)) + lastscavenge = now + nscavenge++ + } + if debug.schedtrace > 0 && lasttrace+int64(debug.schedtrace*1000000) <= now { + lasttrace = now + schedtrace(debug.scheddetail > 0) + } + } + } + + var pdesc [_MaxGomaxprocs]struct { + schedtick uint32 + schedwhen int64 + syscalltick uint32 + syscallwhen int64 + } + + func retake(now int64) uint32 { + n := 0 + for i := int32(0); i < gomaxprocs; i++ { + _p_ := allp[i] + if _p_ == nil { + continue + } + pd := &pdesc[i] + s := _p_.status + if s == _Psyscall { + // Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us). + t := int64(_p_.syscalltick) + if int64(pd.syscalltick) != t { + pd.syscalltick = uint32(t) + pd.syscallwhen = now + continue + } + // On the one hand we don't want to retake Ps if there is no other work to do, + // but on the other hand we want to retake them eventually + // because they can prevent the sysmon thread from deep sleep. + if _p_.runqhead == _p_.runqtail && atomicload(&sched.nmspinning)+atomicload(&sched.npidle) > 0 && pd.syscallwhen+10*1000*1000 > now { + continue + } + // Need to decrement number of idle locked M's + // (pretending that one more is running) before the CAS. + // Otherwise the M from which we retake can exit the syscall, + // increment nmidle and report deadlock. + incidlelocked(-1) + if cas(&_p_.status, s, _Pidle) { + n++ + handoffp(_p_) + } + incidlelocked(1) + } else if s == _Prunning { + // Preempt G if it's running for more than 10ms. + t := int64(_p_.schedtick) + if int64(pd.schedtick) != t { + pd.schedtick = uint32(t) + pd.schedwhen = now + continue + } + if pd.schedwhen+10*1000*1000 > now { + continue + } + preemptone(_p_) + } + } + return uint32(n) + } + + // Tell all goroutines that they have been preempted and they should stop. + // This function is purely best-effort. It can fail to inform a goroutine if a + // processor just started running it. + // No locks need to be held. + // Returns true if preemption request was issued to at least one goroutine. + func preemptall() bool { + res := false + for i := int32(0); i < gomaxprocs; i++ { + _p_ := allp[i] + if _p_ == nil || _p_.status != _Prunning { + continue + } + if preemptone(_p_) { + res = true + } + } + return res + } + + // Tell the goroutine running on processor P to stop. + // This function is purely best-effort. It can incorrectly fail to inform the + // goroutine. It can send inform the wrong goroutine. Even if it informs the + // correct goroutine, that goroutine might ignore the request if it is + // simultaneously executing newstack. + // No lock needs to be held. + // Returns true if preemption request was issued. 
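retake above detects a stuck P by snapshotting its schedtick together with a timestamp: if the tick has not advanced in 10ms, the same G has been running the whole time and gets a preemption request. A toy version of that bookkeeping:

package main

import "fmt"

type pdesc struct {
	schedtick uint32
	schedwhen int64 // nanoseconds
}

// shouldPreempt updates pd and reports whether the P's current G has been
// running for more than 10ms without a scheduler pass.
func shouldPreempt(pd *pdesc, tick uint32, now int64) bool {
	if pd.schedtick != tick {
		pd.schedtick = tick
		pd.schedwhen = now
		return false
	}
	return pd.schedwhen+10*1000*1000 <= now
}

func main() {
	var pd pdesc
	fmt.Println(shouldPreempt(&pd, 7, 0))            // false: first observation
	fmt.Println(shouldPreempt(&pd, 7, 5*1000*1000))  // false: only 5ms so far
	fmt.Println(shouldPreempt(&pd, 7, 12*1000*1000)) // true: same tick for 12ms
}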
+ // The actual preemption will happen at some point in the future + // and will be indicated by the gp->status no longer being + // Grunning + func preemptone(_p_ *p) bool { + mp := _p_.m + if mp == nil || mp == getg().m { + return false + } + gp := mp.curg + if gp == nil || gp == mp.g0 { + return false + } + + gp.preempt = true + + // Every call in a go routine checks for stack overflow by + // comparing the current stack pointer to gp->stackguard0. + // Setting gp->stackguard0 to StackPreempt folds + // preemption into the normal stack overflow check. + gp.stackguard0 = stackPreempt + return true + } + + var starttime int64 + + func schedtrace(detailed bool) { + now := nanotime() + if starttime == 0 { + starttime = now + } + + lock(&sched.lock) + print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle, " threads=", sched.mcount, " spinningthreads=", sched.nmspinning, " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize) + if detailed { + print(" gcwaiting=", sched.gcwaiting, " nmidlelocked=", sched.nmidlelocked, " stopwait=", sched.stopwait, " sysmonwait=", sched.sysmonwait, "\n") + } + // We must be careful while reading data from P's, M's and G's. + // Even if we hold schedlock, most data can be changed concurrently. + // E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil. + for i := int32(0); i < gomaxprocs; i++ { + _p_ := allp[i] + if _p_ == nil { + continue + } + mp := _p_.m + h := atomicload(&_p_.runqhead) + t := atomicload(&_p_.runqtail) + if detailed { + id := int32(-1) + if mp != nil { + id = mp.id + } + print(" P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gfreecnt, "\n") + } else { + // In non-detailed mode format lengths of per-P run queues as: + // [len1 len2 len3 len4] + print(" ") + if i == 0 { + print("[") + } + print(t - h) + if i == gomaxprocs-1 { + print("]\n") + } + } + } + + if !detailed { + unlock(&sched.lock) + return + } + + for mp := allm; mp != nil; mp = mp.alllink { + _p_ := mp.p + gp := mp.curg + lockedg := mp.lockedg + id1 := int32(-1) + if _p_ != nil { + id1 = _p_.id + } + id2 := int64(-1) + if gp != nil { + id2 = gp.goid + } + id3 := int64(-1) + if lockedg != nil { + id3 = lockedg.goid + } + print(" M", mp.id, ": p=", id1, " curg=", id2, " mallocing=", mp.mallocing, " throwing=", mp.throwing, " gcing=", mp.gcing, ""+" locks=", mp.locks, " dying=", mp.dying, " helpgc=", mp.helpgc, " spinning=", mp.spinning, " blocked=", getg().m.blocked, " lockedg=", id3, "\n") + } + + lock(&allglock) + for gi := 0; gi < len(allgs); gi++ { + gp := allgs[gi] + mp := gp.m + lockedm := gp.lockedm + id1 := int32(-1) + if mp != nil { + id1 = mp.id + } + id2 := int32(-1) + if lockedm != nil { + id2 = lockedm.id + } + print(" G", gp.goid, ": status=", readgstatus(gp), "(", gp.waitreason, ") m=", id1, " lockedm=", id2, "\n") + } + unlock(&allglock) + unlock(&sched.lock) + } + + // Put mp on midle list. + // Sched must be locked. + func mput(mp *m) { + mp.schedlink = sched.midle + sched.midle = mp + sched.nmidle++ + checkdead() + } + + // Try to get an m from midle list. + // Sched must be locked. + func mget() *m { + mp := sched.midle + if mp != nil { + sched.midle = mp.schedlink + sched.nmidle-- + } + return mp + } + + // Put gp on the global runnable queue. + // Sched must be locked. 
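mput and mget above keep idle M's on an intrusive LIFO list threaded through each M's own schedlink field, so pushing and popping is a couple of pointer moves under sched.lock. A standalone sketch with toy types:

package main

import "fmt"

type mlike struct {
	id        int
	schedlink *mlike
}

type schedlike struct {
	midle  *mlike
	nmidle int32
}

func mput(s *schedlike, mp *mlike) {
	mp.schedlink = s.midle
	s.midle = mp
	s.nmidle++
}

func mget(s *schedlike) *mlike {
	mp := s.midle
	if mp != nil {
		s.midle = mp.schedlink
		s.nmidle--
	}
	return mp
}

func main() {
	var s schedlike
	mput(&s, &mlike{id: 1})
	mput(&s, &mlike{id: 2})
	fmt.Println(mget(&s).id, s.nmidle) // 2 1: LIFO order
}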
+ func globrunqput(gp *g) { + gp.schedlink = nil + if sched.runqtail != nil { + sched.runqtail.schedlink = gp + } else { + sched.runqhead = gp + } + sched.runqtail = gp + sched.runqsize++ + } + + // Put a batch of runnable goroutines on the global runnable queue. + // Sched must be locked. + func globrunqputbatch(ghead *g, gtail *g, n int32) { + gtail.schedlink = nil + if sched.runqtail != nil { + sched.runqtail.schedlink = ghead + } else { + sched.runqhead = ghead + } + sched.runqtail = gtail + sched.runqsize += n + } + + // Try get a batch of G's from the global runnable queue. + // Sched must be locked. + func globrunqget(_p_ *p, max int32) *g { + if sched.runqsize == 0 { + return nil + } + + n := sched.runqsize/gomaxprocs + 1 + if n > sched.runqsize { + n = sched.runqsize + } + if max > 0 && n > max { + n = max + } + if n > int32(len(_p_.runq))/2 { + n = int32(len(_p_.runq)) / 2 + } + + sched.runqsize -= n + if sched.runqsize == 0 { + sched.runqtail = nil + } + + gp := sched.runqhead + sched.runqhead = gp.schedlink + n-- + for ; n > 0; n-- { + gp1 := sched.runqhead + sched.runqhead = gp1.schedlink + runqput(_p_, gp1) + } + return gp + } + + // Put p to on _Pidle list. + // Sched must be locked. + func pidleput(_p_ *p) { + _p_.link = sched.pidle + sched.pidle = _p_ + xadd(&sched.npidle, 1) // TODO: fast atomic + } + + // Try get a p from _Pidle list. + // Sched must be locked. + func pidleget() *p { + _p_ := sched.pidle + if _p_ != nil { + sched.pidle = _p_.link + xadd(&sched.npidle, -1) // TODO: fast atomic + } + return _p_ + } + + // Try to put g on local runnable queue. + // If it's full, put onto global queue. + // Executed only by the owner P. + func runqput(_p_ *p, gp *g) { + retry: + h := atomicload(&_p_.runqhead) // load-acquire, synchronize with consumers + t := _p_.runqtail + if t-h < uint32(len(_p_.runq)) { + _p_.runq[t%uint32(len(_p_.runq))] = gp + atomicstore(&_p_.runqtail, t+1) // store-release, makes the item available for consumption + return + } + if runqputslow(_p_, gp, h, t) { + return + } + // the queue is not full, now the put above must suceed + goto retry + } + + // Put g and a batch of work from local runnable queue on global queue. + // Executed only by the owner P. + func runqputslow(_p_ *p, gp *g, h, t uint32) bool { + var batch [len(_p_.runq)/2 + 1]*g + + // First, grab a batch from local queue. + n := t - h + n = n / 2 + if n != uint32(len(_p_.runq)/2) { + gothrow("runqputslow: queue is not full") + } + for i := uint32(0); i < n; i++ { + batch[i] = _p_.runq[(h+i)%uint32(len(_p_.runq))] + } + if !cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume + return false + } + batch[n] = gp + + // Link the goroutines. + for i := uint32(0); i < n; i++ { + batch[i].schedlink = batch[i+1] + } + + // Now put the batch on global queue. + lock(&sched.lock) + globrunqputbatch(batch[0], batch[n], int32(n+1)) + unlock(&sched.lock) + return true + } + + // Get g from local runnable queue. + // Executed only by the owner P. + func runqget(_p_ *p) *g { + for { + h := atomicload(&_p_.runqhead) // load-acquire, synchronize with other consumers + t := _p_.runqtail + if t == h { + return nil + } + gp := _p_.runq[h%uint32(len(_p_.runq))] + if cas(&_p_.runqhead, h, h+1) { // cas-release, commits consume + return gp + } + } + } + + // Grabs a batch of goroutines from local runnable queue. + // batch array must be of size len(p->runq)/2. Returns number of grabbed goroutines. + // Can be executed by any P. 
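The local run queue above is the heart of the scheduler's fast path: the owning P publishes work with an atomic store of the tail, any P may consume by CAS-ing the head forward, and overflow spills half the queue to the locked global list. A toy, self-contained version (ints instead of *g, and a mutex-protected slice standing in for the global queue):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type runq struct {
	head, tail uint32
	buf        [8]int // work items; the runtime stores *g here

	globalMu sync.Mutex
	global   []int
}

// put is called only by the owner, mirroring runqput/runqputslow.
func (q *runq) put(v int) {
	for {
		h := atomic.LoadUint32(&q.head) // load-acquire, synchronize with consumers
		t := q.tail
		if t-h < uint32(len(q.buf)) {
			q.buf[t%uint32(len(q.buf))] = v
			atomic.StoreUint32(&q.tail, t+1) // store-release, publishes the item
			return
		}
		// Full: move half of the items (plus v) to the global queue.
		n := (t - h) / 2
		batch := make([]int, 0, n+1)
		for i := uint32(0); i < n; i++ {
			batch = append(batch, q.buf[(h+i)%uint32(len(q.buf))])
		}
		if !atomic.CompareAndSwapUint32(&q.head, h, h+n) {
			continue // a consumer advanced head first; retry the fast path
		}
		batch = append(batch, v)
		q.globalMu.Lock()
		q.global = append(q.global, batch...)
		q.globalMu.Unlock()
		return
	}
}

// get mirrors runqget: any consumer may pop by advancing head with a CAS.
func (q *runq) get() (int, bool) {
	for {
		h := atomic.LoadUint32(&q.head)
		t := atomic.LoadUint32(&q.tail)
		if t == h {
			return 0, false
		}
		v := q.buf[h%uint32(len(q.buf))]
		if atomic.CompareAndSwapUint32(&q.head, h, h+1) {
			return v, true
		}
	}
}

func main() {
	var q runq
	for i := 1; i <= 10; i++ {
		q.put(i)
	}
	v, _ := q.get()
	fmt.Println(v, len(q.global)) // 5 5: items 1-4 and 9 spilled to global, 5 is next locally
}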
+ func runqgrab(_p_ *p, batch []*g) uint32 { + for { + h := atomicload(&_p_.runqhead) // load-acquire, synchronize with other consumers + t := atomicload(&_p_.runqtail) // load-acquire, synchronize with the producer + n := t - h + n = n - n/2 + if n == 0 { + return 0 + } + if n > uint32(len(_p_.runq)/2) { // read inconsistent h and t + continue + } + for i := uint32(0); i < n; i++ { + batch[i] = _p_.runq[(h+i)%uint32(len(_p_.runq))] + } + if cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume + return n + } + } + } + + // Steal half of elements from local runnable queue of p2 + // and put onto local runnable queue of p. + // Returns one of the stolen elements (or nil if failed). + func runqsteal(_p_, p2 *p) *g { + var batch [len(_p_.runq) / 2]*g + + n := runqgrab(p2, batch[:]) + if n == 0 { + return nil + } + n-- + gp := batch[n] + if n == 0 { + return gp + } + h := atomicload(&_p_.runqhead) // load-acquire, synchronize with consumers + t := _p_.runqtail + if t-h+n >= uint32(len(_p_.runq)) { + gothrow("runqsteal: runq overflow") + } + for i := uint32(0); i < n; i++ { + _p_.runq[(t+i)%uint32(len(_p_.runq))] = batch[i] + } + atomicstore(&_p_.runqtail, t+n) // store-release, makes the item available for consumption + return gp + } + + func testSchedLocalQueue() { + _p_ := new(p) + gs := make([]g, len(_p_.runq)) + for i := 0; i < len(_p_.runq); i++ { + if runqget(_p_) != nil { + gothrow("runq is not empty initially") + } + for j := 0; j < i; j++ { + runqput(_p_, &gs[i]) + } + for j := 0; j < i; j++ { + if runqget(_p_) != &gs[i] { + print("bad element at iter ", i, "/", j, "\n") + gothrow("bad element") + } + } + if runqget(_p_) != nil { + gothrow("runq is not empty afterwards") + } + } + } + + func testSchedLocalQueueSteal() { + p1 := new(p) + p2 := new(p) + gs := make([]g, len(p1.runq)) + for i := 0; i < len(p1.runq); i++ { + for j := 0; j < i; j++ { + gs[j].sig = 0 + runqput(p1, &gs[j]) + } + gp := runqsteal(p2, p1) + s := 0 + if gp != nil { + s++ + gp.sig++ + } + for { + gp = runqget(p2) + if gp == nil { + break + } + s++ + gp.sig++ + } + for { + gp = runqget(p1) + if gp == nil { + break + } + gp.sig++ + } + for j := 0; j < i; j++ { + if gs[j].sig != 1 { + print("bad element ", j, "(", gs[j].sig, ") at iter ", i, "\n") + gothrow("bad element") + } + } + if s != i/2 && s != i/2+1 { + print("bad steal ", s, ", want ", i/2, " or ", i/2+1, ", iter ", i, "\n") + gothrow("bad steal") + } + } + } + + func setMaxThreads(in int) (out int) { + lock(&sched.lock) + out = int(sched.maxmcount) + sched.maxmcount = int32(in) + checkmcount() + unlock(&sched.lock) + return + } + + var goexperiment string = "GOEXPERIMENT" // TODO: defined in zaexperiment.h + + func haveexperiment(name string) bool { + x := goexperiment + for x != "" { + xname := "" + i := index(x, ",") + if i < 0 { + xname, x = x, "" + } else { + xname, x = x[:i], x[i+1:] + } + if xname == name { + return true + } + } + return false + } + + //go:nosplit + func sync_procPin() int { + _g_ := getg() + mp := _g_.m + + mp.locks++ + return int(mp.p.id) + } + + //go:nosplit + func sync_procUnpin() { + _g_ := getg() + _g_.m.locks-- + } diff --cc src/runtime/rt0_linux_386.s index 352e594d53,352e594d53..47fd908e78 --- a/src/runtime/rt0_linux_386.s +++ b/src/runtime/rt0_linux_386.s @@@ -9,7 -9,7 +9,6 @@@ TEXT _rt0_386_linux(SB),NOSPLIT,$ LEAL 12(SP), BX MOVL AX, 0(SP) MOVL BX, 4(SP) -- CALL runtime·linux_setup_vdso(SB) CALL main(SB) INT $3 diff --cc src/runtime/runtime2.go index 0000000000,c999b3072d..7625a2dd81 mode 000000,100644..100644 --- 
a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@@ -1,0 -1,608 +1,613 @@@ + // Copyright 2009 The Go Authors. All rights reserved. + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + + package runtime + + import "unsafe" + + /* + * defined constants + */ + const ( + // G status + // + // If you add to this list, add to the list + // of "okay during garbage collection" status + // in mgc0.c too. + _Gidle = iota // 0 + _Grunnable // 1 runnable and on a run queue + _Grunning // 2 + _Gsyscall // 3 + _Gwaiting // 4 + _Gmoribund_unused // 5 currently unused, but hardcoded in gdb scripts + _Gdead // 6 + _Genqueue // 7 Only the Gscanenqueue is used. + _Gcopystack // 8 in this state when newstack is moving the stack + // the following encode that the GC is scanning the stack and what to do when it is done + _Gscan = 0x1000 // atomicstatus&~Gscan = the non-scan state, + // _Gscanidle = _Gscan + _Gidle, // Not used. Gidle only used with newly malloced gs + _Gscanrunnable = _Gscan + _Grunnable // 0x1001 When scanning complets make Grunnable (it is already on run queue) + _Gscanrunning = _Gscan + _Grunning // 0x1002 Used to tell preemption newstack routine to scan preempted stack. + _Gscansyscall = _Gscan + _Gsyscall // 0x1003 When scanning completes make is Gsyscall + _Gscanwaiting = _Gscan + _Gwaiting // 0x1004 When scanning completes make it Gwaiting + // _Gscanmoribund_unused, // not possible + // _Gscandead, // not possible + _Gscanenqueue = _Gscan + _Genqueue // When scanning completes make it Grunnable and put on runqueue + ) + + const ( + // P status + _Pidle = iota + _Prunning + _Psyscall + _Pgcstop + _Pdead + ) + + // XXX inserting below here + + type mutex struct { + // Futex-based impl treats it as uint32 key, + // while sema-based impl as M* waitm. + // Used to be a union, but unions break precise GC. + key uintptr + } + + type note struct { + // Futex-based impl treats it as uint32 key, + // while sema-based impl as M* waitm. + // Used to be a union, but unions break precise GC. + key uintptr + } + + type _string struct { + str *byte + len int + } + + type funcval struct { + fn uintptr + // variable-size, fn-specific data here + } + + type iface struct { + tab *itab + data unsafe.Pointer + } + + type eface struct { + _type *_type + data unsafe.Pointer + } + + type slice struct { + array *byte // actual data + len uint // number of elements + cap uint // allocated number of elements + } + + type gobuf struct { + // The offsets of sp, pc, and g are known to (hard-coded in) libmach. + sp uintptr + pc uintptr + g *g + ctxt unsafe.Pointer // this has to be a pointer so that gc scans it + ret uintreg + lr uintptr + } + + // Known to compiler. + // Changes here must also be made in src/cmd/gc/select.c's selecttype. + type sudog struct { + g *g + selectdone *uint32 + next *sudog + prev *sudog + elem unsafe.Pointer // data element + releasetime int64 + nrelease int32 // -1 for acquire + waitlink *sudog // g.waiting list + } + + type gcstats struct { + // the struct must consist of only uint64's, + // because it is casted to uint64[]. 
+ nhandoff uint64 + nhandoffcnt uint64 + nprocyield uint64 + nosyield uint64 + nsleep uint64 + } + + type libcall struct { + fn uintptr + n uintptr // number of parameters + args uintptr // parameters + r1 uintptr // return values + r2 uintptr + err uintptr // error number + } + + // describes how to handle callback + type wincallbackcontext struct { + gobody unsafe.Pointer // go function to call + argsize uintptr // callback arguments size (in bytes) + restorestack uintptr // adjust stack on return by (in bytes) (386 only) + cleanstack bool + } + + // Stack describes a Go execution stack. + // The bounds of the stack are exactly [lo, hi), + // with no implicit data structures on either side. + type stack struct { + lo uintptr + hi uintptr + } + + type g struct { + // Stack parameters. + // stack describes the actual stack memory: [stack.lo, stack.hi). + // stackguard0 is the stack pointer compared in the Go stack growth prologue. + // It is stack.lo+StackGuard normally, but can be StackPreempt to trigger a preemption. + // stackguard1 is the stack pointer compared in the C stack growth prologue. + // It is stack.lo+StackGuard on g0 and gsignal stacks. + // It is ~0 on other goroutine stacks, to trigger a call to morestackc (and crash). + stack stack // offset known to runtime/cgo + stackguard0 uintptr // offset known to liblink + stackguard1 uintptr // offset known to liblink + + _panic *_panic // innermost panic - offset known to liblink + _defer *_defer // innermost defer + sched gobuf + syscallsp uintptr // if status==gsyscall, syscallsp = sched.sp to use during gc + syscallpc uintptr // if status==gsyscall, syscallpc = sched.pc to use during gc + param unsafe.Pointer // passed parameter on wakeup + atomicstatus uint32 + goid int64 + waitsince int64 // approx time when the g become blocked + waitreason string // if status==gwaiting + schedlink *g + issystem bool // do not output in stack dump, ignore in deadlock detector + preempt bool // preemption signal, duplicates stackguard0 = stackpreempt + paniconfault bool // panic (instead of crash) on unexpected fault address + preemptscan bool // preempted g does scan for gc + gcworkdone bool // debug: cleared at begining of gc work phase cycle, set by gcphasework, tested at end of cycle + throwsplit bool // must not split stack + raceignore int8 // ignore race detection events + m *m // for debuggers, but offset not hard-coded + lockedm *m + sig uint32 + writebuf []byte + sigcode0 uintptr + sigcode1 uintptr + sigpc uintptr + gopc uintptr // pc of go statement that created this goroutine + racectx uintptr + waiting *sudog // sudog structures this g is waiting on (that have a valid elem ptr) + end [0]byte + } + + type mts struct { + tv_sec int64 + tv_nsec int64 + } + + type mscratch struct { + v [6]uintptr + } + + type m struct { + g0 *g // goroutine with scheduling stack + morebuf gobuf // gobuf arg to morestack + + // Fields not known to debuggers. 
+ procid uint64 // for debuggers, but offset not hard-coded + gsignal *g // signal-handling g + tls [4]uintptr // thread-local storage (for x86 extern register) + mstartfn unsafe.Pointer // todo go func() + curg *g // current running goroutine + caughtsig *g // goroutine running during fatal signal + p *p // attached p for executing go code (nil if not executing go code) + nextp *p + id int32 + mallocing int32 + throwing int32 + gcing int32 + locks int32 + softfloat int32 + dying int32 + profilehz int32 + helpgc int32 + spinning bool // m is out of work and is actively looking for work + blocked bool // m is blocked on a note ++ inwb bool // m is executing a write barrier ++ printlock int8 + fastrand uint32 + ncgocall uint64 // number of cgo calls in total + ncgo int32 // number of cgo calls currently in progress + cgomal *cgomal + park note + alllink *m // on allm + schedlink *m + machport uint32 // return address for mach ipc (os x) + mcache *mcache + lockedg *g + createstack [32]uintptr // stack that created this thread. + freglo [16]uint32 // d[i] lsb and f[i] + freghi [16]uint32 // d[i] msb and f[i+16] + fflag uint32 // floating point compare flags + locked uint32 // tracking for lockosthread + nextwaitm *m // next m waiting for lock + waitsema uintptr // semaphore for parking on locks + waitsemacount uint32 + waitsemalock uint32 + gcstats gcstats + needextram bool + traceback uint8 + waitunlockf unsafe.Pointer // todo go func(*g, unsafe.pointer) bool + waitlock unsafe.Pointer + //#ifdef GOOS_windows + thread uintptr // thread handle + // these are here because they are too large to be on the stack + // of low-level NOSPLIT functions. + libcall libcall + libcallpc uintptr // for cpu profiler + libcallsp uintptr + libcallg *g + //#endif + //#ifdef GOOS_solaris + perrno *int32 // pointer to tls errno + // these are here because they are too large to be on the stack + // of low-level NOSPLIT functions. + //LibCall libcall; + ts mts + scratch mscratch + //#endif + //#ifdef GOOS_plan9 + notesig *int8 + errstr *byte + //#endif + end [0]byte + } + + type p struct { + lock mutex + + id int32 + status uint32 // one of pidle/prunning/... + link *p + schedtick uint32 // incremented on every scheduler call + syscalltick uint32 // incremented on every system call + m *m // back-link to associated m (nil if idle) + mcache *mcache + deferpool [5]*_defer // pool of available defer structs of different sizes (see panic.c) + + // Cache of goroutine ids, amortizes accesses to runtime·sched.goidgen. + goidcache uint64 + goidcacheend uint64 + + // Queue of runnable goroutines. + runqhead uint32 + runqtail uint32 + runq [256]*g + + // Available G's (status == Gdead) + gfree *g + gfreecnt int32 + + pad [64]byte + } + + const ( + // The max value of GOMAXPROCS. + // There are no fundamental restrictions on the value. + _MaxGomaxprocs = 1 << 8 + ) + + type schedt struct { + lock mutex + + goidgen uint64 + + midle *m // idle m's waiting for work + nmidle int32 // number of idle m's waiting for work + nmidlelocked int32 // number of locked m's waiting for work + mcount int32 // number of m's that have been created + maxmcount int32 // maximum number of m's allowed (or die) + + pidle *p // idle p's + npidle uint32 + nmspinning uint32 + + // Global runnable queue. + runqhead *g + runqtail *g + runqsize int32 + + // Global cache of dead G's. 
+ gflock mutex + gfree *g + ngfree int32 + + gcwaiting uint32 // gc is waiting to run + stopwait int32 + stopnote note + sysmonwait uint32 + sysmonnote note + lastpoll uint64 + + profilehz int32 // cpu profiling rate + } + + // The m->locked word holds two pieces of state counting active calls to LockOSThread/lockOSThread. + // The low bit (LockExternal) is a boolean reporting whether any LockOSThread call is active. + // External locks are not recursive; a second lock is silently ignored. + // The upper bits of m->lockedcount record the nesting depth of calls to lockOSThread + // (counting up by LockInternal), popped by unlockOSThread (counting down by LockInternal). + // Internal locks can be recursive. For instance, a lock for cgo can occur while the main + // goroutine is holding the lock during the initialization phase. + const ( + _LockExternal = 1 + _LockInternal = 2 + ) + + type sigtabtt struct { + flags int32 + name *int8 + } + + const ( + _SigNotify = 1 << 0 // let signal.Notify have signal, even if from kernel + _SigKill = 1 << 1 // if signal.Notify doesn't take it, exit quietly + _SigThrow = 1 << 2 // if signal.Notify doesn't take it, exit loudly + _SigPanic = 1 << 3 // if the signal is from the kernel, panic + _SigDefault = 1 << 4 // if the signal isn't explicitly requested, don't monitor it + _SigHandling = 1 << 5 // our signal handler is registered + _SigIgnored = 1 << 6 // the signal was ignored before we registered for it + _SigGoExit = 1 << 7 // cause all runtime procs to exit (only used on Plan 9). + ) + + // Layout of in-memory per-function information prepared by linker + // See http://golang.org/s/go12symtab. + // Keep in sync with linker and with ../../libmach/sym.c + // and with package debug/gosym and with symtab.go in package runtime. + type _func struct { + entry uintptr // start pc + nameoff int32 // function name + + args int32 // in/out args size + frame int32 // legacy frame size; use pcsp if possible + + pcsp int32 + pcfile int32 + pcln int32 + npcdata int32 + nfuncdata int32 + } + + // layout of Itab known to compilers + // allocated in non-garbage-collected memory + type itab struct { + inter *interfacetype + _type *_type + link *itab + bad int32 + unused int32 + fun [0]uintptr + } + + const ( + // TODO: Generate in cmd/dist. + _NaCl = 0 + _Windows = 0 + _Solaris = 0 + _Plan9 = 0 + ) + + // Lock-free stack node. ++// // Also known to export_test.go. + type lfnode struct { - next *lfnode ++ next uint64 + pushcnt uintptr + } + + // Parallel for descriptor. + type parfor struct { + body unsafe.Pointer // go func(*parfor, uint32), executed for each element + done uint32 // number of idle threads + nthr uint32 // total number of threads + nthrmax uint32 // maximum number of threads + thrseq uint32 // thread id sequencer + cnt uint32 // iteration space [0, cnt) + ctx unsafe.Pointer // arbitrary user context + wait bool // if true, wait while all threads finish processing, + // otherwise parfor may return while other threads are still working + thr *parforthread // array of thread descriptors + pad uint32 // to align parforthread.pos for 64-bit atomic operations + // stats + nsteal uint64 + nstealcnt uint64 + nprocyield uint64 + nosyield uint64 + nsleep uint64 + } + + // Track memory allocated by code not written in Go during a cgo call, + // so that the garbage collector can see them. + type cgomal struct { + next *cgomal + alloc unsafe.Pointer + } + + // Holds variables parsed from GODEBUG env var. 
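
The lfnode change above (next *lfnode -> next uint64) is what lets the lock-free stack coexist with a collector written in Go: the link word is no longer a pointer the GC would try to trace, and head/next instead hold a packed (node address, push count) pair whose counter guards against ABA reuse. The sketch below only illustrates that packing idea; it assumes 48-bit user addresses with a 16-bit counter, and pack/unpack are made-up names (the runtime's real packing helpers are per-architecture).

    package main

    import (
    	"fmt"
    	"unsafe"
    )

    // node mirrors the shape of lfnode: the link is a plain uint64,
    // not a Go pointer, so a precise GC never traces it.
    type node struct {
    	next    uint64
    	pushcnt uintptr
    }

    // pack squeezes a node address and a push counter into one word.
    // Assumes user-space addresses fit in 48 bits (typical on amd64);
    // the counter keeps reused nodes from being confused (ABA).
    func pack(n *node, cnt uintptr) uint64 {
    	return uint64(uintptr(unsafe.Pointer(n)))<<16 | uint64(cnt&(1<<16-1))
    }

    // unpack recovers the node address and counter. Turning an integer
    // back into a pointer like this is only safe here because the node
    // stays reachable through the caller; it is an illustration, not a
    // general-purpose technique.
    func unpack(v uint64) (*node, uintptr) {
    	return (*node)(unsafe.Pointer(uintptr(v >> 16))), uintptr(v & (1<<16 - 1))
    }

    func main() {
    	n := &node{pushcnt: 7}
    	v := pack(n, n.pushcnt)
    	m, cnt := unpack(v)
    	fmt.Println(m == n, cnt) // true 7
    }
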
+ type debugvars struct { + allocfreetrace int32 + efence int32 + gctrace int32 + gcdead int32 + scheddetail int32 + schedtrace int32 + scavenge int32 + } + + // Indicates to write barrier and sychronization task to preform. + const ( - _GCoff = iota // stop and start nop - _GCquiesce // stop and start nop - _GCstw // stop the ps nop - _GCmark // scan the stacks and start no white to black - _GCsweep // stop and start nop ++ _GCoff = iota // GC not running, write barrier disabled ++ _GCquiesce // unused state ++ _GCstw // unused state ++ _GCscan // GC collecting roots into workbufs, write barrier disabled ++ _GCmark // GC marking from workbufs, write barrier ENABLED ++ _GCmarktermination // GC mark termination: allocate black, P's help GC, write barrier ENABLED ++ _GCsweep // GC mark completed; sweeping in background, write barrier disabled + ) + + type forcegcstate struct { + lock mutex + g *g + idle uint32 + } + + var gcphase uint32 + + /* + * known to compiler + */ + const ( + _Structrnd = regSize + ) + + var startup_random_data *byte + var startup_random_data_len uint32 + + var invalidptr int32 + + const ( + // hashinit wants this many random bytes + _HashRandomBytes = 32 + ) + + /* + * deferred subroutine calls + */ + type _defer struct { + siz int32 + started bool + argp uintptr // where args were copied from + pc uintptr + fn *funcval + _panic *_panic // panic that is running defer + link *_defer + } + + /* + * panics + */ + type _panic struct { + argp unsafe.Pointer // pointer to arguments of deferred call run during panic; cannot move - known to liblink + arg interface{} // argument to panic + link *_panic // link to earlier panic + recovered bool // whether this panic is over + aborted bool // the panic was aborted + } + + /* + * stack traces + */ + + type stkframe struct { + fn *_func // function being run + pc uintptr // program counter within fn + continpc uintptr // program counter where execution can continue, or 0 if not + lr uintptr // program counter at caller aka link register + sp uintptr // stack pointer at pc + fp uintptr // stack pointer at caller aka frame pointer + varp uintptr // top of local variables + argp uintptr // pointer to function arguments + arglen uintptr // number of bytes at argp + argmap *bitvector // force use of this argmap + } + + const ( + _TraceRuntimeFrames = 1 << 0 // include frames for internal runtime functions. + _TraceTrap = 1 << 1 // the initial PC, SP are from a trap, not a return PC from a call + ) + + const ( + // The maximum number of frames we print for a traceback + _TracebackMaxFrames = 100 + ) + + var ( + emptystring string + allg **g + allglen uintptr + lastg *g + allm *m + allp [_MaxGomaxprocs + 1]*p + gomaxprocs int32 + needextram uint32 + panicking uint32 + goos *int8 + ncpu int32 + iscgo bool + cpuid_ecx uint32 + cpuid_edx uint32 + debug debugvars + signote note + forcegc forcegcstate + sched schedt + newprocs int32 + ) + + /* + * mutual exclusion locks. in the uncontended case, + * as fast as spin locks (just a few user-level instructions), + * but on the contention path they sleep in the kernel. + * a zeroed Mutex is unlocked (no need to initialize each lock). + */ + + /* + * sleep and wakeup on one-time events. + * before any calls to notesleep or notewakeup, + * must call noteclear to initialize the Note. + * then, exactly one thread can call notesleep + * and exactly one thread can call notewakeup (once). + * once notewakeup has been called, the notesleep + * will return. future notesleep will return immediately. 
+ * subsequent noteclear must be called only after + * previous notesleep has returned, e.g. it's disallowed + * to call noteclear straight after notewakeup. + * + * notetsleep is like notesleep but wakes up after + * a given number of nanoseconds even if the event + * has not yet happened. if a goroutine uses notetsleep to + * wake up early, it must wait to call noteclear until it + * can be sure that no other goroutine is calling + * notewakeup. + * + * notesleep/notetsleep are generally called on g0, + * notetsleepg is similar to notetsleep but is called on user g. + */ + // bool runtime·notetsleep(Note*, int64); // false - timeout + // bool runtime·notetsleepg(Note*, int64); // false - timeout + + /* + * Lock-free stack. + * Initialize uint64 head to 0, compare with 0 to test for emptiness. + * The stack does not keep pointers to nodes, + * so they can be garbage collected if there are no other pointers to nodes. + */ + + /* + * Parallel for over [0, n). + * body() is executed for each iteration. + * nthr - total number of worker threads. + * ctx - arbitrary user context. + * if wait=true, threads return from parfor() when all work is done; + * otherwise, threads can return while other threads are still finishing processing. + */ + + // for mmap, we only pass the lower 32 bits of file offset to the + // assembly routine; the higher bits (if required), should be provided + // by the assembly routine as 0. diff --cc src/runtime/stack1.go index 0000000000,40dfc76a6d..963f4fa731 mode 000000,100644..100644 --- a/src/runtime/stack1.go +++ b/src/runtime/stack1.go @@@ -1,0 -1,807 +1,818 @@@ + // Copyright 2013 The Go Authors. All rights reserved. + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + + package runtime + + import "unsafe" + + const ( + // StackDebug == 0: no logging + // == 1: logging of per-stack operations + // == 2: logging of per-frame operations + // == 3: logging of per-word updates + // == 4: logging of per-word reads + stackDebug = 0 + stackFromSystem = 0 // allocate stacks from system memory instead of the heap + stackFaultOnFree = 0 // old stacks are mapped noaccess to detect use after free + stackPoisonCopy = 0 // fill stack that should not be accessed with garbage, to detect bad dereferences during copy + + stackCache = 1 + ) + + const ( + uintptrMask = 1<<(8*ptrSize) - 1 + poisonGC = uintptrMask & 0xf969696969696969 + poisonStack = uintptrMask & 0x6868686868686868 + + // Goroutine preemption request. + // Stored into g->stackguard0 to cause split stack check failure. + // Must be greater than any real sp. + // 0xfffffade in hex. + stackPreempt = uintptrMask & -1314 + + // Thread is forking. + // Stored into g->stackguard0 to cause split stack check failure. + // Must be greater than any real sp. + stackFork = uintptrMask & -1234 + ) + + // Global pool of spans that have free stacks. + // Stacks are assigned an order according to size. + // order = log_2(size/FixedStack) + // There is a free list for each order. + // TODO: one lock per order? + var stackpool [_NumStackOrders]mspan + var stackpoolmu mutex + + var stackfreequeue stack + + func stackinit() { + if _StackCacheSize&_PageMask != 0 { + gothrow("cache size must be a multiple of page size") + } + for i := range stackpool { + mSpanList_Init(&stackpool[i]) + } + } + + // Allocates a stack from the free pool. Must be called with + // stackpoolmu held. 
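
The stack allocator introduced above keeps one free list per size "order", with order = log_2(size/FixedStack); stackalloc and stackfree recompute that order with a shift loop. Here is a standalone sketch of the computation, using assumed stand-ins for the runtime-internal constants (_FixedStack taken as 2 KB, _NumStackOrders as 4) and the made-up helper name orderForSize.

    package main

    import "fmt"

    const (
    	fixedStack     = 2048 // assumed stand-in for _FixedStack
    	numStackOrders = 4    // assumed stand-in for _NumStackOrders
    )

    // orderForSize mirrors the shift loop in stackalloc/stackfree:
    // it returns log2(n/fixedStack) for a power-of-two stack size n.
    func orderForSize(n uint32) (order uint8, ok bool) {
    	if n&(n-1) != 0 || n < fixedStack {
    		return 0, false // stack sizes are always powers of two >= fixedStack
    	}
    	n2 := n
    	for n2 > fixedStack {
    		order++
    		n2 >>= 1
    	}
    	return order, order < numStackOrders
    }

    func main() {
    	for _, n := range []uint32{2048, 4096, 8192, 16384, 32768} {
    		o, cached := orderForSize(n)
    		fmt.Printf("size %5d -> order %d (served from per-order pool: %v)\n", n, o, cached)
    	}
    }
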
+ func stackpoolalloc(order uint8) *mlink { + list := &stackpool[order] + s := list.next + if s == list { + // no free stacks. Allocate another span worth. + s = mHeap_AllocStack(&mheap_, _StackCacheSize>>_PageShift) + if s == nil { + gothrow("out of memory") + } + if s.ref != 0 { + gothrow("bad ref") + } + if s.freelist != nil { + gothrow("bad freelist") + } + for i := uintptr(0); i < _StackCacheSize; i += _FixedStack << order { + x := (*mlink)(unsafe.Pointer(uintptr(s.start)<<_PageShift + i)) + x.next = s.freelist + s.freelist = x + } + mSpanList_Insert(list, s) + } + x := s.freelist + if x == nil { + gothrow("span has no free stacks") + } + s.freelist = x.next + s.ref++ + if s.freelist == nil { + // all stacks in s are allocated. + mSpanList_Remove(s) + } + return x + } + + // Adds stack x to the free pool. Must be called with stackpoolmu held. + func stackpoolfree(x *mlink, order uint8) { + s := mHeap_Lookup(&mheap_, (unsafe.Pointer)(x)) + if s.state != _MSpanStack { + gothrow("freeing stack not in a stack span") + } + if s.freelist == nil { + // s will now have a free stack + mSpanList_Insert(&stackpool[order], s) + } + x.next = s.freelist + s.freelist = x + s.ref-- + if s.ref == 0 { + // span is completely free - return to heap + mSpanList_Remove(s) + s.freelist = nil + mHeap_FreeStack(&mheap_, s) + } + } + + // stackcacherefill/stackcacherelease implement a global pool of stack segments. + // The pool is required to prevent unlimited growth of per-thread caches. + func stackcacherefill(c *mcache, order uint8) { + if stackDebug >= 1 { + print("stackcacherefill order=", order, "\n") + } + + // Grab some stacks from the global cache. + // Grab half of the allowed capacity (to prevent thrashing). + var list *mlink + var size uintptr + lock(&stackpoolmu) + for size < _StackCacheSize/2 { + x := stackpoolalloc(order) + x.next = list + list = x + size += _FixedStack << order + } + unlock(&stackpoolmu) + c.stackcache[order].list = list + c.stackcache[order].size = size + } + + func stackcacherelease(c *mcache, order uint8) { + if stackDebug >= 1 { + print("stackcacherelease order=", order, "\n") + } + x := c.stackcache[order].list + size := c.stackcache[order].size + lock(&stackpoolmu) + for size > _StackCacheSize/2 { + y := x.next + stackpoolfree(x, order) + x = y + size -= _FixedStack << order + } + unlock(&stackpoolmu) + c.stackcache[order].list = x + c.stackcache[order].size = size + } + + func stackcache_clear(c *mcache) { + if stackDebug >= 1 { + print("stackcache clear\n") + } + lock(&stackpoolmu) + for order := uint8(0); order < _NumStackOrders; order++ { + x := c.stackcache[order].list + for x != nil { + y := x.next + stackpoolfree(x, order) + x = y + } + c.stackcache[order].list = nil + c.stackcache[order].size = 0 + } + unlock(&stackpoolmu) + } + + func stackalloc(n uint32) stack { + // Stackalloc must be called on scheduler stack, so that we + // never try to grow the stack during the code that stackalloc runs. + // Doing so would cause a deadlock (issue 1547). + thisg := getg() + if thisg != thisg.m.g0 { + gothrow("stackalloc not on scheduler stack") + } + if n&(n-1) != 0 { + gothrow("stack size not a power of 2") + } + if stackDebug >= 1 { + print("stackalloc ", n, "\n") + } + + if debug.efence != 0 || stackFromSystem != 0 { + v := sysAlloc(round(uintptr(n), _PageSize), &memstats.stacks_sys) + if v == nil { + gothrow("out of memory (stackalloc)") + } + return stack{uintptr(v), uintptr(v) + uintptr(n)} + } + + // Small stacks are allocated with a fixed-size free-list allocator. 
+ // If we need a stack of a bigger size, we fall back on allocating + // a dedicated span. + var v unsafe.Pointer + if stackCache != 0 && n < _FixedStack<<_NumStackOrders && n < _StackCacheSize { + order := uint8(0) + n2 := n + for n2 > _FixedStack { + order++ + n2 >>= 1 + } + var x *mlink + c := thisg.m.mcache + if c == nil || thisg.m.gcing != 0 || thisg.m.helpgc != 0 { + // c == nil can happen in the guts of exitsyscall or + // procresize. Just get a stack from the global pool. + // Also don't touch stackcache during gc + // as it's flushed concurrently. + lock(&stackpoolmu) + x = stackpoolalloc(order) + unlock(&stackpoolmu) + } else { + x = c.stackcache[order].list + if x == nil { + stackcacherefill(c, order) + x = c.stackcache[order].list + } + c.stackcache[order].list = x.next + c.stackcache[order].size -= uintptr(n) + } + v = (unsafe.Pointer)(x) + } else { + s := mHeap_AllocStack(&mheap_, round(uintptr(n), _PageSize)>>_PageShift) + if s == nil { + gothrow("out of memory") + } + v = (unsafe.Pointer)(s.start << _PageShift) + } + + if raceenabled { + racemalloc(v, uintptr(n)) + } + if stackDebug >= 1 { + print(" allocated ", v, "\n") + } + return stack{uintptr(v), uintptr(v) + uintptr(n)} + } + + func stackfree(stk stack) { + gp := getg() + n := stk.hi - stk.lo + v := (unsafe.Pointer)(stk.lo) + if n&(n-1) != 0 { + gothrow("stack not a power of 2") + } + if stackDebug >= 1 { + println("stackfree", v, n) + memclr(v, n) // for testing, clobber stack data + } + if debug.efence != 0 || stackFromSystem != 0 { + if debug.efence != 0 || stackFaultOnFree != 0 { + sysFault(v, n) + } else { + sysFree(v, n, &memstats.stacks_sys) + } + return + } + if stackCache != 0 && n < _FixedStack<<_NumStackOrders && n < _StackCacheSize { + order := uint8(0) + n2 := n + for n2 > _FixedStack { + order++ + n2 >>= 1 + } + x := (*mlink)(v) + c := gp.m.mcache + if c == nil || gp.m.gcing != 0 || gp.m.helpgc != 0 { + lock(&stackpoolmu) + stackpoolfree(x, order) + unlock(&stackpoolmu) + } else { + if c.stackcache[order].size >= _StackCacheSize { + stackcacherelease(c, order) + } + x.next = c.stackcache[order].list + c.stackcache[order].list = x + c.stackcache[order].size += n + } + } else { + s := mHeap_Lookup(&mheap_, v) + if s.state != _MSpanStack { + println(hex(s.start<<_PageShift), v) + gothrow("bad span state") + } + mHeap_FreeStack(&mheap_, s) + } + } + + var maxstacksize uintptr = 1 << 20 // enough until runtime.main sets it for real + + var mapnames = []string{ + _BitsDead: "---", + _BitsScalar: "scalar", + _BitsPointer: "ptr", + } + + // Stack frame layout + // + // (x86) + // +------------------+ + // | args from caller | + // +------------------+ <- frame->argp + // | return address | + // +------------------+ <- frame->varp + // | locals | + // +------------------+ + // | args to callee | + // +------------------+ <- frame->sp + // + // (arm) + // +------------------+ + // | args from caller | + // +------------------+ <- frame->argp + // | caller's retaddr | + // +------------------+ <- frame->varp + // | locals | + // +------------------+ + // | args to callee | + // +------------------+ + // | return address | + // +------------------+ <- frame->sp + + type adjustinfo struct { + old stack + delta uintptr // ptr distance from old to new stack (newbase - oldbase) + } + + // Adjustpointer checks whether *vpp is in the old stack described by adjinfo. + // If so, it rewrites *vpp to point into the new stack. 
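
adjustinfo above carries the old stack bounds together with the distance from the old stack to the new one (new.hi - old.hi in copystack), and adjustpointer, defined just below, plus the other adjust* helpers all reduce to the same test: if a word points into [old.lo, old.hi), slide it by delta, otherwise leave it alone. A toy illustration with made-up addresses follows; stackBounds and adjust are hypothetical names, and because uintptr arithmetic wraps, the same code also covers a shrink, where delta is effectively negative.

    package main

    import "fmt"

    // stackBounds is a stand-in for the runtime's stack{lo, hi}.
    type stackBounds struct{ lo, hi uintptr }

    // adjust mirrors the core test: pointers into [old.lo, old.hi)
    // move by delta; everything else (heap, globals) is left alone.
    func adjust(p uintptr, old stackBounds, delta uintptr) uintptr {
    	if old.lo <= p && p < old.hi {
    		return p + delta
    	}
    	return p
    }

    func main() {
    	old := stackBounds{lo: 0xc0000000, hi: 0xc0002000}    // hypothetical 8 KB stack
    	newStk := stackBounds{lo: 0xc0100000, hi: 0xc0104000} // hypothetical 16 KB stack
    	delta := newStk.hi - old.hi

    	fmt.Printf("%#x\n", adjust(0xc0001f00, old, delta)) // in the old stack: moved
    	fmt.Printf("%#x\n", adjust(0xc0200000, old, delta)) // elsewhere: unchanged
    }
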
+ func adjustpointer(adjinfo *adjustinfo, vpp unsafe.Pointer) { + pp := (*unsafe.Pointer)(vpp) + p := *pp + if stackDebug >= 4 { + print(" ", pp, ":", p, "\n") + } + if adjinfo.old.lo <= uintptr(p) && uintptr(p) < adjinfo.old.hi { + *pp = add(p, adjinfo.delta) + if stackDebug >= 3 { + print(" adjust ptr ", pp, ":", p, " -> ", *pp, "\n") + } + } + } + + type gobitvector struct { + n uintptr + bytedata []uint8 + } + + func gobv(bv bitvector) gobitvector { + return gobitvector{ + uintptr(bv.n), + (*[1 << 30]byte)(unsafe.Pointer(bv.bytedata))[:(bv.n+7)/8], + } + } + + func ptrbits(bv *gobitvector, i uintptr) uint8 { + return (bv.bytedata[i/4] >> ((i & 3) * 2)) & 3 + } + + // bv describes the memory starting at address scanp. + // Adjust any pointers contained therein. + func adjustpointers(scanp unsafe.Pointer, cbv *bitvector, adjinfo *adjustinfo, f *_func) { + bv := gobv(*cbv) + minp := adjinfo.old.lo + maxp := adjinfo.old.hi + delta := adjinfo.delta + num := uintptr(bv.n / _BitsPerPointer) + for i := uintptr(0); i < num; i++ { + if stackDebug >= 4 { + print(" ", add(scanp, i*ptrSize), ":", mapnames[ptrbits(&bv, i)], ":", hex(*(*uintptr)(add(scanp, i*ptrSize))), " # ", i, " ", bv.bytedata[i/4], "\n") + } + switch ptrbits(&bv, i) { + default: + gothrow("unexpected pointer bits") + case _BitsDead: + if debug.gcdead != 0 { + *(*unsafe.Pointer)(add(scanp, i*ptrSize)) = unsafe.Pointer(uintptr(poisonStack)) + } + case _BitsScalar: + // ok + case _BitsPointer: + p := *(*unsafe.Pointer)(add(scanp, i*ptrSize)) + up := uintptr(p) + if f != nil && 0 < up && up < _PageSize && invalidptr != 0 || up == poisonGC || up == poisonStack { + // Looks like a junk value in a pointer slot. + // Live analysis wrong? + getg().m.traceback = 2 + print("runtime: bad pointer in frame ", gofuncname(f), " at ", add(scanp, i*ptrSize), ": ", p, "\n") + gothrow("invalid stack pointer") + } + if minp <= up && up < maxp { + if stackDebug >= 3 { + print("adjust ptr ", p, " ", gofuncname(f), "\n") + } + *(*unsafe.Pointer)(add(scanp, i*ptrSize)) = unsafe.Pointer(up + delta) + } + } + } + } + + // Note: the argument/return area is adjusted by the callee. + func adjustframe(frame *stkframe, arg unsafe.Pointer) bool { + adjinfo := (*adjustinfo)(arg) + targetpc := frame.continpc + if targetpc == 0 { + // Frame is dead. + return true + } + f := frame.fn + if stackDebug >= 2 { + print(" adjusting ", funcname(f), " frame=[", hex(frame.sp), ",", hex(frame.fp), "] pc=", hex(frame.pc), " continpc=", hex(frame.continpc), "\n") + } + if f.entry == systemstack_switchPC { + // A special routine at the bottom of stack of a goroutine that does an systemstack call. + // We will allow it to be copied even though we don't + // have full GC info for it (because it is written in asm). + return true + } + if targetpc != f.entry { + targetpc-- + } + pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc) + if pcdata == -1 { + pcdata = 0 // in prologue + } + + // Adjust local variables if stack frame has been allocated. + size := frame.varp - frame.sp + var minsize uintptr + if thechar != '6' && thechar != '8' { + minsize = ptrSize + } else { + minsize = 0 + } + if size > minsize { + var bv bitvector + stackmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps)) + if stackmap == nil || stackmap.n <= 0 { + print("runtime: frame ", funcname(f), " untyped locals ", hex(frame.varp-size), "+", hex(size), "\n") + gothrow("missing stackmap") + } + // Locals bitmap information, scan just the pointers in locals. 
+ if pcdata < 0 || pcdata >= stackmap.n { + // don't know where we are + print("runtime: pcdata is ", pcdata, " and ", stackmap.n, " locals stack map entries for ", funcname(f), " (targetpc=", targetpc, ")\n") + gothrow("bad symbol table") + } + bv = stackmapdata(stackmap, pcdata) + size = (uintptr(bv.n) * ptrSize) / _BitsPerPointer + if stackDebug >= 3 { + print(" locals ", pcdata, "/", stackmap.n, " ", size/ptrSize, " words ", bv.bytedata, "\n") + } + adjustpointers(unsafe.Pointer(frame.varp-size), &bv, adjinfo, f) + } + + // Adjust arguments. + if frame.arglen > 0 { + var bv bitvector + if frame.argmap != nil { + bv = *frame.argmap + } else { + stackmap := (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps)) + if stackmap == nil || stackmap.n <= 0 { + print("runtime: frame ", funcname(f), " untyped args ", frame.argp, "+", uintptr(frame.arglen), "\n") + gothrow("missing stackmap") + } + if pcdata < 0 || pcdata >= stackmap.n { + // don't know where we are + print("runtime: pcdata is ", pcdata, " and ", stackmap.n, " args stack map entries for ", funcname(f), " (targetpc=", targetpc, ")\n") + gothrow("bad symbol table") + } + bv = stackmapdata(stackmap, pcdata) + } + if stackDebug >= 3 { + print(" args\n") + } + adjustpointers(unsafe.Pointer(frame.argp), &bv, adjinfo, nil) + } + return true + } + + func adjustctxt(gp *g, adjinfo *adjustinfo) { + adjustpointer(adjinfo, (unsafe.Pointer)(&gp.sched.ctxt)) + } + + func adjustdefers(gp *g, adjinfo *adjustinfo) { + // Adjust defer argument blocks the same way we adjust active stack frames. + tracebackdefers(gp, adjustframe, noescape(unsafe.Pointer(adjinfo))) + + // Adjust pointers in the Defer structs. + // Defer structs themselves are never on the stack. + for d := gp._defer; d != nil; d = d.link { + adjustpointer(adjinfo, (unsafe.Pointer)(&d.fn)) + adjustpointer(adjinfo, (unsafe.Pointer)(&d.argp)) + adjustpointer(adjinfo, (unsafe.Pointer)(&d._panic)) + } + } + + func adjustpanics(gp *g, adjinfo *adjustinfo) { + // Panics are on stack and already adjusted. + // Update pointer to head of list in G. + adjustpointer(adjinfo, (unsafe.Pointer)(&gp._panic)) + } + + func adjustsudogs(gp *g, adjinfo *adjustinfo) { + // the data elements pointed to by a SudoG structure + // might be in the stack. + for s := gp.waiting; s != nil; s = s.waitlink { + adjustpointer(adjinfo, (unsafe.Pointer)(&s.elem)) + adjustpointer(adjinfo, (unsafe.Pointer)(&s.selectdone)) + } + } + + func fillstack(stk stack, b byte) { + for p := stk.lo; p < stk.hi; p++ { + *(*byte)(unsafe.Pointer(p)) = b + } + } + + // Copies gp's stack to a new stack of a different size. ++// Caller must have changed gp status to Gcopystack. 
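
The requirement spelled out above, that the caller must already have moved gp into the new _Gcopystack status before copystack runs, is what keeps the concurrent collector from scanning a stack that is halfway copied: the status change is a compare-and-swap on the g's status word, so the copier and the scanner cannot both own the goroutine at once. Below is a toy model of that guard, with made-up status constants and a bool-returning casStatus; the real readgstatus/casgstatus are runtime-internal and spin or throw instead of returning false.

    package main

    import (
    	"fmt"
    	"sync/atomic"
    )

    // Toy status values standing in for the runtime's _Gwaiting,
    // _Grunnable, _Gcopystack, ... constants.
    const (
    	statusWaiting uint32 = iota + 1
    	statusRunnable
    	statusCopystack
    )

    type toyG struct {
    	status uint32
    }

    // casStatus models the status guard: the status only moves
    // oldS -> newS atomically, so two parties cannot both claim the g.
    func casStatus(gp *toyG, oldS, newS uint32) bool {
    	return atomic.CompareAndSwapUint32(&gp.status, oldS, newS)
    }

    func main() {
    	gp := &toyG{status: statusWaiting}

    	// Park the goroutine in the copy state before touching its stack.
    	if !casStatus(gp, statusWaiting, statusCopystack) {
    		fmt.Println("lost the race: someone else owns the status")
    		return
    	}
    	fmt.Println("copying stack while status =", atomic.LoadUint32(&gp.status))
    	// ... the copystack-equivalent work would happen here ...
    	casStatus(gp, statusCopystack, statusWaiting) // restore the old status
    	fmt.Println("done, status restored to", atomic.LoadUint32(&gp.status))
    }
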
+ func copystack(gp *g, newsize uintptr) {
+ if gp.syscallsp != 0 {
+ gothrow("stack growth not allowed in system call")
+ }
+ old := gp.stack
+ if old.lo == 0 {
+ gothrow("nil stackbase")
+ }
+ used := old.hi - gp.sched.sp
+
+ // allocate new stack
+ new := stackalloc(uint32(newsize))
+ if stackPoisonCopy != 0 {
+ fillstack(new, 0xfd)
+ }
+ if stackDebug >= 1 {
+ print("copystack gp=", gp, " [", hex(old.lo), " ", hex(old.hi-used), " ", hex(old.hi), "]/", old.hi-old.lo, " -> [", hex(new.lo), " ", hex(new.hi-used), " ", hex(new.hi), "]/", newsize, "\n")
+ }
+
+ // adjust pointers in the to-be-copied frames
+ var adjinfo adjustinfo
+ adjinfo.old = old
+ adjinfo.delta = new.hi - old.hi
+ gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, adjustframe, noescape(unsafe.Pointer(&adjinfo)), 0)
+
+ // adjust other miscellaneous things that have pointers into stacks.
+ adjustctxt(gp, &adjinfo)
+ adjustdefers(gp, &adjinfo)
+ adjustpanics(gp, &adjinfo)
+ adjustsudogs(gp, &adjinfo)
+
+ // copy the stack to the new location
+ if stackPoisonCopy != 0 {
+ fillstack(new, 0xfb)
+ }
+ memmove(unsafe.Pointer(new.hi-used), unsafe.Pointer(old.hi-used), used)
+
- oldstatus := readgstatus(gp)
- oldstatus &^= _Gscan
- if oldstatus == _Gwaiting || oldstatus == _Grunnable {
- casgstatus(gp, oldstatus, _Gcopystack) // oldstatus is Gwaiting or Grunnable
- } else {
- gothrow("copystack: bad status, not Gwaiting or Grunnable")
- }
-
+ // Swap out old stack for new one
+ gp.stack = new
+ gp.stackguard0 = new.lo + _StackGuard // NOTE: might clobber a preempt request
+ gp.sched.sp = new.hi - used
+
- casgstatus(gp, _Gcopystack, oldstatus) // oldstatus is Gwaiting or Grunnable
-
+ // free old stack
+ if stackPoisonCopy != 0 {
+ fillstack(old, 0xfc)
+ }
+ if newsize > old.hi-old.lo {
+ // growing, free stack immediately
+ stackfree(old)
+ } else {
+ // shrinking, queue up free operation. We can't actually free the stack
+ // just yet because we might run into the following situation:
+ // 1) GC starts, scans a SudoG but does not yet mark the SudoG.elem pointer
+ // 2) The stack that pointer points to is shrunk
+ // 3) The old stack is freed
+ // 4) The containing span is marked free
+ // 5) GC attempts to mark the SudoG.elem pointer. The marking fails because
+ // the pointer looks like a pointer into a free span.
+ // By not freeing, we prevent step #4 until GC is done.
+ lock(&stackpoolmu)
+ *(*stack)(unsafe.Pointer(old.lo)) = stackfreequeue
+ stackfreequeue = old
+ unlock(&stackpoolmu)
+ }
+ }
+
+ // round x up to a power of 2.
+ func round2(x int32) int32 {
+ s := uint(0)
+ for 1<<s < x {
+ s++
+ }
+ return 1 << s
+ }
+
+ // Called from runtime·morestack when more stack is needed.
+ // Allocate larger stack and relocate to new stack.
+ // Stack growth is multiplicative, for constant amortized cost.
+ //
+ // g->atomicstatus will be Grunning or Gscanrunning upon entry.
+ // If the GC is trying to stop this g then it will set preemptscan to true.
+ func newstack() {
+ thisg := getg()
+ // TODO: double check all gp. shouldn't be getg().
+ if thisg.m.morebuf.g.stackguard0 == stackFork {
+ gothrow("stack growth after fork")
+ }
+ if thisg.m.morebuf.g != thisg.m.curg {
+ print("runtime: newstack called from g=", thisg.m.morebuf.g, "\n"+"\tm=", thisg.m, " m->curg=", thisg.m.curg, " m->g0=", thisg.m.g0, " m->gsignal=", thisg.m.gsignal, "\n")
+ morebuf := thisg.m.morebuf
+ traceback(morebuf.pc, morebuf.sp, morebuf.lr, morebuf.g)
+ gothrow("runtime: wrong goroutine in newstack")
+ }
+ if thisg.m.curg.throwsplit {
+ gp := thisg.m.curg
+ // Update syscallsp, syscallpc in case traceback uses them.
+ morebuf := thisg.m.morebuf + gp.syscallsp = morebuf.sp + gp.syscallpc = morebuf.pc + print("runtime: newstack sp=", hex(gp.sched.sp), " stack=[", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n", + "\tmorebuf={pc:", hex(morebuf.pc), " sp:", hex(morebuf.sp), " lr:", hex(morebuf.lr), "}\n", + "\tsched={pc:", hex(gp.sched.pc), " sp:", hex(gp.sched.sp), " lr:", hex(gp.sched.lr), " ctxt:", gp.sched.ctxt, "}\n") + gothrow("runtime: stack split at bad time") + } + + // The goroutine must be executing in order to call newstack, + // so it must be Grunning or Gscanrunning. + + gp := thisg.m.curg + morebuf := thisg.m.morebuf + thisg.m.morebuf.pc = 0 + thisg.m.morebuf.lr = 0 + thisg.m.morebuf.sp = 0 + thisg.m.morebuf.g = nil + + casgstatus(gp, _Grunning, _Gwaiting) + gp.waitreason = "stack growth" + + rewindmorestack(&gp.sched) + + if gp.stack.lo == 0 { + gothrow("missing stack in newstack") + } + sp := gp.sched.sp + if thechar == '6' || thechar == '8' { + // The call to morestack cost a word. + sp -= ptrSize + } + if stackDebug >= 1 || sp < gp.stack.lo { + print("runtime: newstack sp=", hex(sp), " stack=[", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n", + "\tmorebuf={pc:", hex(morebuf.pc), " sp:", hex(morebuf.sp), " lr:", hex(morebuf.lr), "}\n", + "\tsched={pc:", hex(gp.sched.pc), " sp:", hex(gp.sched.sp), " lr:", hex(gp.sched.lr), " ctxt:", gp.sched.ctxt, "}\n") + } + if sp < gp.stack.lo { + print("runtime: gp=", gp, ", gp->status=", hex(readgstatus(gp)), "\n ") + print("runtime: split stack overflow: ", hex(sp), " < ", hex(gp.stack.lo), "\n") + gothrow("runtime: split stack overflow") + } + ++ if gp.sched.ctxt != nil { ++ // morestack wrote sched.ctxt on its way in here, ++ // without a write barrier. Run the write barrier now. ++ // It is not possible to be preempted between then ++ // and now, so it's okay. ++ writebarrierptr_nostore((*uintptr)(unsafe.Pointer(&gp.sched.ctxt)), uintptr(gp.sched.ctxt)) ++ } ++ + if gp.stackguard0 == stackPreempt { + if gp == thisg.m.g0 { + gothrow("runtime: preempt g0") + } + if thisg.m.p == nil && thisg.m.locks == 0 { + gothrow("runtime: g is running but p is not") + } + if gp.preemptscan { + gcphasework(gp) + casgstatus(gp, _Gwaiting, _Grunning) + gp.stackguard0 = gp.stack.lo + _StackGuard + gp.preempt = false + gp.preemptscan = false // Tells the GC premption was successful. + gogo(&gp.sched) // never return + } + + // Be conservative about where we preempt. + // We are interested in preempting user Go code, not runtime code. + if thisg.m.locks != 0 || thisg.m.mallocing != 0 || thisg.m.gcing != 0 || thisg.m.p.status != _Prunning { + // Let the goroutine keep running for now. + // gp->preempt is set, so it will be preempted next time. + gp.stackguard0 = gp.stack.lo + _StackGuard + casgstatus(gp, _Gwaiting, _Grunning) + gogo(&gp.sched) // never return + } + + // Act like goroutine called runtime.Gosched. + casgstatus(gp, _Gwaiting, _Grunning) + gosched_m(gp) // never return + } + + // Allocate a bigger segment and move the stack. + oldsize := int(gp.stack.hi - gp.stack.lo) + newsize := oldsize * 2 + if uintptr(newsize) > maxstacksize { + print("runtime: goroutine stack exceeds ", maxstacksize, "-byte limit\n") + gothrow("stack overflow") + } + - // Note that the concurrent GC might be scanning the stack as we try to replace it. - // copystack takes care of the appropriate coordination with the stack scanner. 
++ oldstatus := readgstatus(gp) ++ oldstatus &^= _Gscan ++ casgstatus(gp, oldstatus, _Gcopystack) // oldstatus is Gwaiting or Grunnable ++ ++ // The concurrent GC will not scan the stack while we are doing the copy since ++ // the gp is in a Gcopystack status. + copystack(gp, uintptr(newsize)) + if stackDebug >= 1 { + print("stack grow done\n") + } - casgstatus(gp, _Gwaiting, _Grunning) ++ casgstatus(gp, _Gcopystack, _Grunning) + gogo(&gp.sched) + } + + //go:nosplit + func nilfunc() { + *(*uint8)(nil) = 0 + } + + // adjust Gobuf as if it executed a call to fn + // and then did an immediate gosave. + func gostartcallfn(gobuf *gobuf, fv *funcval) { + var fn unsafe.Pointer + if fv != nil { + fn = (unsafe.Pointer)(fv.fn) + } else { + fn = unsafe.Pointer(funcPC(nilfunc)) + } + gostartcall(gobuf, fn, (unsafe.Pointer)(fv)) + } + + // Maybe shrink the stack being used by gp. + // Called at garbage collection time. + func shrinkstack(gp *g) { + if readgstatus(gp) == _Gdead { + if gp.stack.lo != 0 { + // Free whole stack - it will get reallocated + // if G is used again. + stackfree(gp.stack) + gp.stack.lo = 0 + gp.stack.hi = 0 + } + return + } + if gp.stack.lo == 0 { + gothrow("missing stack in shrinkstack") + } + + oldsize := gp.stack.hi - gp.stack.lo + newsize := oldsize / 2 + if newsize < _FixedStack { + return // don't shrink below the minimum-sized stack + } + used := gp.stack.hi - gp.sched.sp + if used >= oldsize/4 { + return // still using at least 1/4 of the segment. + } + + // We can't copy the stack if we're in a syscall. + // The syscall might have pointers into the stack. + if gp.syscallsp != 0 { + return + } - - /* TODO - if _Windows && gp.m != nil && gp.m.libcallsp != 0 { ++ if _Windows != 0 && gp.m != nil && gp.m.libcallsp != 0 { + return + } - */ + + if stackDebug > 0 { + print("shrinking stack ", oldsize, "->", newsize, "\n") + } ++ ++ // This is being done in a Gscan state and was initiated by the GC so no need to move to ++ // the Gcopystate. ++ // The world is stopped, so the goroutine must be Gwaiting or Grunnable, ++ // and what it is is not changing underfoot. ++ oldstatus := readgstatus(gp) &^ _Gscan ++ if oldstatus != _Gwaiting && oldstatus != _Grunnable { ++ gothrow("status is not Gwaiting or Grunnable") ++ } ++ casgstatus(gp, oldstatus, _Gcopystack) + copystack(gp, newsize) ++ casgstatus(gp, _Gcopystack, oldstatus) + } + + // Do any delayed stack freeing that was queued up during GC. + func shrinkfinish() { + lock(&stackpoolmu) + s := stackfreequeue + stackfreequeue = stack{} + unlock(&stackpoolmu) + for s.lo != 0 { + t := *(*stack)(unsafe.Pointer(s.lo)) + stackfree(s) + s = t + } + } + + //go:nosplit + func morestackc() { + systemstack(func() { + gothrow("attempt to execute C code on Go stack") + }) + } diff --cc src/runtime/vdso_none.go index 0000000000,ac6f8cb18d..6f83ecc895 mode 000000,100644..100644 --- a/src/runtime/vdso_none.go +++ b/src/runtime/vdso_none.go @@@ -1,0 -1,10 +1,11 @@@ + // Copyright 2014 The Go Authors. All rights reserved. + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + + // +build !linux !amd64 ++// +build !linux !386 + + package runtime + + func sysargs(argc int32, argv **byte) { + }
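
In the vdso_none.go hunk just above, the terms on one +build line are OR'ed and separate +build lines are AND'ed, so adding "// +build !linux !386" narrows the empty sysargs stub to every platform except linux/amd64 and linux/386, which must get sysargs from their own files. A quick check of that boolean, with vdsoNoneBuilds as a made-up helper name:

    package main

    import "fmt"

    // vdsoNoneBuilds evaluates the two +build lines on vdso_none.go:
    //   +build !linux !amd64
    //   +build !linux !386
    // Terms on one line are OR'ed, separate lines are AND'ed.
    func vdsoNoneBuilds(goos, goarch string) bool {
    	line1 := goos != "linux" || goarch != "amd64"
    	line2 := goos != "linux" || goarch != "386"
    	return line1 && line2
    }

    func main() {
    	for _, p := range [][2]string{
    		{"linux", "amd64"}, {"linux", "386"}, {"linux", "arm"}, {"darwin", "amd64"},
    	} {
    		fmt.Printf("%s/%s -> %v\n", p[0], p[1], vdsoNoneBuilds(p[0], p[1]))
    	}
    	// Prints false for linux/amd64 and linux/386, true everywhere else.
    }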