From fd050b3c6d0294b6d72adb014ec14b3e6bf4ad60 Mon Sep 17 00:00:00 2001
From: Rhys Hiltner
Date: Fri, 11 Oct 2024 15:31:18 -0700
Subject: [PATCH] runtime: unify lock2, allow deeper sleep
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

The tri-state mutex implementation (unlocked, locked, sleeping) avoids
sleep/wake syscalls when contention is low or absent, but its
performance degrades when many threads are contending for a mutex to
execute a fast critical section.

A fast critical section means frequent unlock2 calls. Each of those
finds the mutex in the "sleeping" state and so wakes a sleeping thread,
even if many other threads are already awake and in the spin loop of
lock2 attempting to acquire the mutex for themselves. Many spinning
threads means wasting energy and CPU time that could be used by other
processes on the machine. Many threads all spinning on the same cache
line leads to performance collapse.

Merge the futex- and semaphore-based mutex implementations by using a
semaphore abstraction for futex platforms. Then, add a bit to the mutex
state word that communicates whether one of the waiting threads is
awake and spinning. When threads in lock2 see the new "spinning" bit,
they can sleep immediately. In unlock2, the "spinning" bit means we can
save a syscall and not wake a sleeping thread.

This brings up the real possibility of starvation: waiting threads are
able to enter a deeper sleep than before, since one of their peers can
volunteer to be the sole "spinning" thread and thus cause unlock2 to
skip the semawakeup call. Additionally, the waiting threads form a LIFO
stack so any wakeups that do occur will target threads that have gone
to sleep most recently. Counteract those effects by periodically waking
the thread at the bottom of the stack and allowing it to spin.

Exempt sched.lock from most of the new behaviors; it's often used by
several threads in sequence to do thread-specific work, so low-latency
handoff is a priority over improved throughput.

Gate use of this implementation behind GOEXPERIMENT=spinbitmutex, so
it's easy to disable. Enable it by default on supported platforms (the
most efficient implementation requires atomic.Xchg8).
Fixes #68578

goos: linux
goarch: amd64
pkg: runtime
cpu: 13th Gen Intel(R) Core(TM) i7-13700H
                              │     old      │                 new                  │
                              │    sec/op    │    sec/op      vs base               │
MutexContention                  17.82n ±  0%   17.74n ±  0%   -0.42% (p=0.000 n=10)
MutexContention-2                22.17n ±  9%   19.85n ± 12%        ~ (p=0.089 n=10)
MutexContention-3                26.14n ± 14%   20.81n ± 13%  -20.41% (p=0.000 n=10)
MutexContention-4                29.28n ±  8%   21.19n ± 10%  -27.62% (p=0.000 n=10)
MutexContention-5                31.79n ±  2%   21.98n ± 10%  -30.83% (p=0.000 n=10)
MutexContention-6                34.63n ±  1%   22.58n ±  5%  -34.79% (p=0.000 n=10)
MutexContention-7                44.16n ±  2%   23.14n ±  7%  -47.59% (p=0.000 n=10)
MutexContention-8                53.81n ±  3%   23.66n ±  6%  -56.04% (p=0.000 n=10)
MutexContention-9                65.58n ±  4%   23.91n ±  9%  -63.54% (p=0.000 n=10)
MutexContention-10               77.35n ±  3%   26.06n ±  9%  -66.31% (p=0.000 n=10)
MutexContention-11               89.62n ±  1%   25.56n ±  9%  -71.47% (p=0.000 n=10)
MutexContention-12              102.45n ±  2%   25.57n ±  7%  -75.04% (p=0.000 n=10)
MutexContention-13              111.95n ±  1%   24.59n ±  8%  -78.04% (p=0.000 n=10)
MutexContention-14              123.95n ±  3%   24.42n ±  6%  -80.30% (p=0.000 n=10)
MutexContention-15              120.80n ± 10%   25.54n ±  6%  -78.86% (p=0.000 n=10)
MutexContention-16              128.10n ± 25%   26.95n ±  4%  -78.96% (p=0.000 n=10)
MutexContention-17              139.80n ± 18%   24.96n ±  5%  -82.14% (p=0.000 n=10)
MutexContention-18              141.35n ±  7%   25.05n ±  8%  -82.27% (p=0.000 n=10)
MutexContention-19              151.35n ± 18%   25.72n ±  6%  -83.00% (p=0.000 n=10)
MutexContention-20              153.30n ± 20%   24.75n ±  6%  -83.85% (p=0.000 n=10)
MutexHandoff/Solo-20             13.54n ±  1%   13.61n ±  4%        ~ (p=0.206 n=10)
MutexHandoff/FastPingPong-20     141.3n ± 209%  164.8n ± 49%        ~ (p=0.436 n=10)
MutexHandoff/SlowPingPong-20     1.572µ ± 16%   1.804µ ± 19%  +14.76% (p=0.015 n=10)
geomean                          74.34n         30.26n        -59.30%

goos: darwin
goarch: arm64
pkg: runtime
cpu: Apple M1
                             │     old     │                 new                 │
                             │   sec/op    │    sec/op     vs base               │
MutexContention                13.86n ± 3%   12.09n ±  3%  -12.73% (p=0.000 n=10)
MutexContention-2              15.88n ± 1%   16.50n ±  2%   +3.94% (p=0.001 n=10)
MutexContention-3              18.45n ± 2%   16.88n ±  2%   -8.54% (p=0.000 n=10)
MutexContention-4              20.01n ± 2%   18.94n ± 18%        ~ (p=0.469 n=10)
MutexContention-5              22.60n ± 1%   17.51n ±  9%  -22.50% (p=0.000 n=10)
MutexContention-6              23.93n ± 2%   17.35n ±  2%  -27.48% (p=0.000 n=10)
MutexContention-7              24.69n ± 1%   17.15n ±  3%  -30.54% (p=0.000 n=10)
MutexContention-8              25.01n ± 1%   17.33n ±  2%  -30.69% (p=0.000 n=10)
MutexHandoff/Solo-8            13.96n ± 4%   12.04n ±  4%  -13.78% (p=0.000 n=10)
MutexHandoff/FastPingPong-8    68.89n ± 4%   64.62n ±  2%   -6.20% (p=0.000 n=10)
MutexHandoff/SlowPingPong-8    9.698µ ± 22%  9.646µ ± 35%        ~ (p=0.912 n=10)
geomean                        38.20n        32.53n        -14.84%

Change-Id: I0058c75eadf282d08eea7fce0d426f0518039f7c
Reviewed-on: https://go-review.googlesource.com/c/go/+/620435
Reviewed-by: Michael Knyszek
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Junyang Shao
Auto-Submit: Rhys Hiltner
---
 src/internal/buildcfg/exp.go       |   7 +
 src/runtime/lock_futex_tristate.go |   2 +
 src/runtime/lock_js.go             |   2 +
 src/runtime/lock_sema.go           |   4 +
 src/runtime/lock_sema_tristate.go  |   8 +-
 src/runtime/lock_spinbit.go        | 369 +++++++++++++++++++++++++++++
 src/runtime/lock_wasip1.go         |   2 +
 src/runtime/proc.go                |   2 +
 src/runtime/runtime2.go            |   7 +
 9 files changed, 399 insertions(+), 4 deletions(-)
 create mode 100644 src/runtime/lock_spinbit.go

diff --git a/src/internal/buildcfg/exp.go b/src/internal/buildcfg/exp.go
index f71cada455..c8ff974767 100644
--- a/src/internal/buildcfg/exp.go
+++ b/src/internal/buildcfg/exp.go
@@ -67,12 +67,19 @@ func ParseGOEXPERIMENT(goos, goarch, goexp string) (*ExperimentFlags, error) {
 		regabiSupported = true
 	}
 
+	var haveXchg8 bool
+	switch goarch {
+	case "386", "amd64", "arm", "arm64", "ppc64le", "ppc64":
+		haveXchg8 = true
+	}
+
 	baseline := goexperiment.Flags{
 		RegabiWrappers:   regabiSupported,
 		RegabiArgs:       regabiSupported,
 		CoverageRedesign: true,
 		AliasTypeParams:  true,
 		SwissMap:         true,
+		SpinbitMutex:     haveXchg8,
 	}
 
 	// Start with the statically enabled set of experiments.
diff --git a/src/runtime/lock_futex_tristate.go b/src/runtime/lock_futex_tristate.go
index dea4323f1e..b7df18c86c 100644
--- a/src/runtime/lock_futex_tristate.go
+++ b/src/runtime/lock_futex_tristate.go
@@ -38,6 +38,8 @@ const (
 
 type mWaitList struct{}
 
+func lockVerifyMSize() {}
+
 func mutexContended(l *mutex) bool {
 	return atomic.Load(key32(&l.key)) > mutex_locked
 }
diff --git a/src/runtime/lock_js.go b/src/runtime/lock_js.go
index bc62c7985d..a40e301085 100644
--- a/src/runtime/lock_js.go
+++ b/src/runtime/lock_js.go
@@ -28,6 +28,8 @@ const (
 
 type mWaitList struct{}
 
+func lockVerifyMSize() {}
+
 func mutexContended(l *mutex) bool {
 	return false
 }
diff --git a/src/runtime/lock_sema.go b/src/runtime/lock_sema.go
index bddb8adea7..3e1b07b918 100644
--- a/src/runtime/lock_sema.go
+++ b/src/runtime/lock_sema.go
@@ -11,6 +11,10 @@ import (
 	"unsafe"
 )
 
+const (
+	locked uintptr = 1
+)
+
 // One-time notifications.
 func noteclear(n *note) {
 	n.key = 0
diff --git a/src/runtime/lock_sema_tristate.go b/src/runtime/lock_sema_tristate.go
index c1f22c5de1..4375791d46 100644
--- a/src/runtime/lock_sema_tristate.go
+++ b/src/runtime/lock_sema_tristate.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build aix || darwin || netbsd || openbsd || plan9 || solaris || windows || ((dragonfly || freebsd || linux) && goexperiment.spinbitmutex)
+//go:build (aix || darwin || netbsd || openbsd || plan9 || solaris || windows) && !goexperiment.spinbitmutex
 
 package runtime
 
@@ -24,8 +24,6 @@ import (
 //	func semawakeup(mp *m)
 //		Wake up mp, which is or will soon be sleeping on its semaphore.
 const (
-	locked uintptr = 1
-
 	active_spin     = 4
 	active_spin_cnt = 30
 	passive_spin    = 1
@@ -42,6 +40,8 @@ type mWaitList struct {
 	next muintptr // next m waiting for lock
 }
 
+func lockVerifyMSize() {}
+
 func mutexContended(l *mutex) bool {
 	return atomic.Loaduintptr(&l.key) > locked
 }
@@ -132,7 +132,7 @@ func unlock2(l *mutex) {
 			mp = muintptr(v &^ locked).ptr()
 			if atomic.Casuintptr(&l.key, v, uintptr(mp.mWaitList.next)) {
 				// Dequeued an M.  Wake it.
-				semawakeup(mp)
+				semawakeup(mp) // no use of mp after this point; it's awake
 				break
 			}
 		}
diff --git a/src/runtime/lock_spinbit.go b/src/runtime/lock_spinbit.go
new file mode 100644
index 0000000000..1f9f289bbf
--- /dev/null
+++ b/src/runtime/lock_spinbit.go
@@ -0,0 +1,369 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || plan9 || solaris || windows) && goexperiment.spinbitmutex
+
+package runtime
+
+import (
+	"internal/goarch"
+	"internal/runtime/atomic"
+	"unsafe"
+)
+
+// This implementation depends on OS-specific implementations of
+//
+//	func semacreate(mp *m)
+//		Create a semaphore for mp, if it does not already have one.
+//
+//	func semasleep(ns int64) int32
+//		If ns < 0, acquire m's semaphore and return 0.
+//		If ns >= 0, try to acquire m's semaphore for at most ns nanoseconds.
+//		Return 0 if the semaphore was acquired, -1 if interrupted or timed out.
+//
+//	func semawakeup(mp *m)
+//		Wake up mp, which is or will soon be sleeping on its semaphore.
+
+// The mutex state consists of four flags and a pointer. The flag at bit 0,
+// mutexLocked, represents the lock itself. Bit 1, mutexSleeping, is a hint that
+// the pointer is non-nil. The fast paths for locking and unlocking the mutex
+// are based on atomic 8-bit swap operations on the low byte; bits 2 through 7
+// are unused.
+//
+// Bit 8, mutexSpinning, is a try-lock that grants a waiting M permission to
+// spin on the state word. Most other Ms must attempt to spend their time
+// sleeping to reduce traffic on the cache line. This is the "spin bit" for
+// which the implementation is named. (The anti-starvation mechanism also grants
+// temporary permission for an M to spin.)
+//
+// Bit 9, mutexStackLocked, is a try-lock that grants an unlocking M permission
+// to inspect the list of waiting Ms and to pop an M off of that stack.
+//
+// The upper bits hold a (partial) pointer to the M that most recently went to
+// sleep. The sleeping Ms form a stack linked by their mWaitList.next fields.
+// Because the fast paths use an 8-bit swap on the low byte of the state word,
+// we'll need to reconstruct the full M pointer from the bits we have. Most Ms
+// are allocated on the heap, and have a known alignment and base offset. (The
+// offset is due to mallocgc's allocation headers.) The main program thread uses
+// a static M value, m0. We check for m0 specifically and add a known offset
+// otherwise.
+
+const (
+	active_spin     = 4  // referenced in proc.go for sync.Mutex implementation
+	active_spin_cnt = 30 // referenced in proc.go for sync.Mutex implementation
+)
+
+const (
+	mutexLocked      = 0x001
+	mutexSleeping    = 0x002
+	mutexSpinning    = 0x100
+	mutexStackLocked = 0x200
+	mutexMMask       = 0x3FF
+	mutexMOffset     = mallocHeaderSize // alignment of heap-allocated Ms (those other than m0)
+
+	mutexActiveSpinCount  = 4
+	mutexActiveSpinSize   = 30
+	mutexPassiveSpinCount = 1
+
+	mutexTailWakePeriod = 16
+)
+
+//go:nosplit
+func key8(p *uintptr) *uint8 {
+	if goarch.BigEndian {
+		return &(*[8]uint8)(unsafe.Pointer(p))[goarch.PtrSize/1-1]
+	}
+	return &(*[8]uint8)(unsafe.Pointer(p))[0]
+}
+
+// mWaitList is part of the M struct, and holds the list of Ms that are waiting
+// for a particular runtime.mutex.
+//
+// When an M is unable to immediately obtain a lock, it adds itself to the list
+// of Ms waiting for the lock. It does that via this struct's next field,
+// forming a singly-linked list with the mutex's key field pointing to the head
+// of the list.
+type mWaitList struct {
+	next muintptr // next m waiting for lock
+}
+
+// lockVerifyMSize confirms that we can recreate the low bits of the M pointer.
+func lockVerifyMSize() {
+	size := roundupsize(unsafe.Sizeof(m{}), false) + mallocHeaderSize
+	if size&mutexMMask != 0 {
+		print("M structure uses sizeclass ", size, "/", hex(size), " bytes; ",
+			"incompatible with mutex flag mask ", hex(mutexMMask), "\n")
+		throw("runtime.m memory alignment too small for spinbit mutex")
+	}
+}
+
+// mutexWaitListHead recovers a full muintptr that was missing its low bits.
+// With the exception of the static m0 value, it requires allocating runtime.m
+// values in a size class with a particular minimum alignment. The 2048-byte
+// size class allows recovering the full muintptr value even after overwriting
+// the low 11 bits with flags. We can use those 11 bits as 3 flags and an
+// atomically-swapped byte.
+//
+//go:nosplit
+func mutexWaitListHead(v uintptr) muintptr {
+	if highBits := v &^ mutexMMask; highBits == 0 {
+		return 0
+	} else if m0bits := muintptr(unsafe.Pointer(&m0)); highBits == uintptr(m0bits)&^mutexMMask {
+		return m0bits
+	} else {
+		return muintptr(highBits + mutexMOffset)
+	}
+}
+
+// mutexPreferLowLatency reports if this mutex prefers low latency at the risk
+// of performance collapse. If so, we can allow all waiting threads to spin on
+// the state word rather than go to sleep.
+//
+// TODO: We could have the waiting Ms each spin on their own private cache line,
+// especially if we can put a bound on the on-CPU time that would consume.
+//
+// TODO: If there's a small set of mutex values with special requirements, they
+// could make use of a more specialized lock2/unlock2 implementation. Otherwise,
+// we're constrained to what we can fit within a single uintptr with no
+// additional storage on the M for each lock held.
+//
+//go:nosplit
+func mutexPreferLowLatency(l *mutex) bool {
+	switch l {
+	default:
+		return false
+	case &sched.lock:
+		// We often expect sched.lock to pass quickly between Ms in a way that
+		// each M has unique work to do: for instance when we stop-the-world
+		// (bringing each P to idle) or add new netpoller-triggered work to the
+		// global run queue.
+		return true
+	}
+}
+
+func mutexContended(l *mutex) bool {
+	return atomic.Loaduintptr(&l.key) > mutexLocked
+}
+
+func lock(l *mutex) {
+	lockWithRank(l, getLockRank(l))
+}
+
+func lock2(l *mutex) {
+	gp := getg()
+	if gp.m.locks < 0 {
+		throw("runtime·lock: lock count")
+	}
+	gp.m.locks++
+
+	k8 := key8(&l.key)
+
+	var v8 uint8
+	// Speculative grab for lock.
+	v8 = atomic.Xchg8(k8, mutexLocked)
+	if v8&mutexLocked == 0 {
+		if v8&mutexSleeping != 0 {
+			atomic.Or8(k8, mutexSleeping)
+		}
+		return
+	}
+	semacreate(gp.m)
+
+	timer := &lockTimer{lock: l}
+	timer.begin()
+	// On uniprocessors, no point spinning.
+	// On multiprocessors, spin for mutexActiveSpinCount attempts.
+	spin := 0
+	if ncpu > 1 {
+		spin = mutexActiveSpinCount
+	}
+
+	var weSpin, atTail bool
+	v := atomic.Loaduintptr(&l.key)
+tryAcquire:
+	for i := 0; ; i++ {
+		for v&mutexLocked == 0 {
+			if weSpin {
+				next := (v &^ mutexMMask) | (v & (mutexMMask &^ mutexSpinning)) | mutexLocked
+				if next&^mutexMMask != 0 {
+					next |= mutexSleeping
+				}
+				if atomic.Casuintptr(&l.key, v, next) {
+					timer.end()
+					return
+				}
+			} else {
+				prev8 := atomic.Xchg8(k8, mutexLocked|mutexSleeping)
+				if prev8&mutexLocked == 0 {
+					timer.end()
+					return
+				}
+			}
+			v = atomic.Loaduintptr(&l.key)
+		}
+
+		if !weSpin && v&mutexSpinning == 0 && atomic.Casuintptr(&l.key, v, v|mutexSpinning) {
+			v |= mutexSpinning
+			weSpin = true
+		}
+
+		if weSpin || atTail || mutexPreferLowLatency(l) {
+			if i < spin {
+				procyield(mutexActiveSpinSize)
+				v = atomic.Loaduintptr(&l.key)
+				continue tryAcquire
+			} else if i < spin+mutexPassiveSpinCount {
+				osyield() // TODO: Consider removing this step. See https://go.dev/issue/69268
+				v = atomic.Loaduintptr(&l.key)
+				continue tryAcquire
+			}
+		}
+
+		// Go to sleep
+		for v&mutexLocked != 0 {
+			// Store the current head of the list of sleeping Ms in our gp.m.mWaitList.next field
+			gp.m.mWaitList.next = mutexWaitListHead(v)
+
+			// Pack a (partial) pointer to this M with the current lock state bits
+			next := (uintptr(unsafe.Pointer(gp.m)) &^ mutexMMask) | v&mutexMMask | mutexSleeping
+			if weSpin { // If we were spinning, prepare to retire
+				next = next &^ mutexSpinning
+			}
+
+			if atomic.Casuintptr(&l.key, v, next) {
+				weSpin = false
+				// We've pushed ourselves onto the stack of waiters. Wait.
+				semasleep(-1)
+				atTail = gp.m.mWaitList.next == 0 // we were at risk of starving
+				gp.m.mWaitList.next = 0
+				i = 0
+				v = atomic.Loaduintptr(&l.key)
+				continue tryAcquire
+			}
+			v = atomic.Loaduintptr(&l.key)
+		}
+	}
+}
+
+func unlock(l *mutex) {
+	unlockWithRank(l)
+}
+
+// We might not be holding a p in this code.
+//
+//go:nowritebarrier
+func unlock2(l *mutex) {
+	gp := getg()
+
+	prev8 := atomic.Xchg8(key8(&l.key), 0)
+	if prev8&mutexLocked == 0 {
+		throw("unlock of unlocked lock")
+	}
+
+	if prev8&mutexSleeping != 0 {
+		unlock2Wake(l)
+	}
+
+	gp.m.mLockProfile.recordUnlock(l)
+	gp.m.locks--
+	if gp.m.locks < 0 {
+		throw("runtime·unlock: lock count")
+	}
+	if gp.m.locks == 0 && gp.preempt { // restore the preemption request in case we've cleared it in newstack
+		gp.stackguard0 = stackPreempt
+	}
+}
+
+// unlock2Wake updates the list of Ms waiting on l, waking an M if necessary.
+//
+//go:nowritebarrier
+func unlock2Wake(l *mutex) {
+	v := atomic.Loaduintptr(&l.key)
+
+	// On occasion, seek out and wake the M at the bottom of the stack so it
+	// doesn't starve.
+	antiStarve := cheaprandn(mutexTailWakePeriod) == 0
+	if !(antiStarve || // avoiding starvation may require a wake
+		v&mutexSpinning == 0 || // no spinners means we must wake
+		mutexPreferLowLatency(l)) { // prefer waiters be awake as much as possible
+		return
+	}
+
+	for {
+		if v&^mutexMMask == 0 || v&mutexStackLocked != 0 {
+			// No waiting Ms means nothing to do.
+			//
+			// If the stack lock is unavailable, its owner would make the same
+			// wake decisions that we would, so there's nothing for us to do.
+			//
+			// Although: This thread may have a different call stack, which
+			// would result in a different entry in the mutex contention profile
+			// (upon completion of go.dev/issue/66999). That could lead to weird
+			// results if a slow critical section ends but another thread
+			// quickly takes the lock, finishes its own critical section,
+			// releases the lock, and then grabs the stack lock. That quick
+			// thread would then take credit (blame) for the delay that this
+			// slow thread caused. The alternative is to have more expensive
+			// atomic operations (a CAS) on the critical path of unlock2.
+			return
+		}
+		// Other M's are waiting for the lock.
+		// Obtain the stack lock, and pop off an M.
+		next := v | mutexStackLocked
+		if atomic.Casuintptr(&l.key, v, next) {
+			break
+		}
+		v = atomic.Loaduintptr(&l.key)
+	}
+
+	// We own the mutexStackLocked flag. New Ms may push themselves onto the
+	// stack concurrently, but we're now the only thread that can remove or
+	// modify the Ms that are sleeping in the list.
+
+	var committed *m // If we choose an M within the stack, we've made a promise to wake it
+	for {
+		headM := v &^ mutexMMask
+		flags := v & (mutexMMask &^ mutexStackLocked) // preserve low bits, but release stack lock
+
+		mp := mutexWaitListHead(v).ptr()
+		wakem := committed
+		if committed == nil {
+			if v&mutexSpinning == 0 || mutexPreferLowLatency(l) {
+				wakem = mp
+			}
+			if antiStarve {
+				// Wake the M at the bottom of the stack of waiters. (This is
+				// O(N) with the number of waiters.)
+				wakem = mp
+				prev := mp
+				for {
+					next := wakem.mWaitList.next.ptr()
+					if next == nil {
+						break
+					}
+					prev, wakem = wakem, next
+				}
+				if wakem != mp {
+					prev.mWaitList.next = wakem.mWaitList.next
+					committed = wakem
+				}
+			}
+		}
+
+		if wakem == mp {
+			headM = uintptr(mp.mWaitList.next) &^ mutexMMask
+		}
+
+		next := headM | flags
+		if atomic.Casuintptr(&l.key, v, next) {
+			if wakem != nil {
+				// Claimed an M. Wake it.
+				semawakeup(wakem)
+			}
+			break
+		}
+
+		v = atomic.Loaduintptr(&l.key)
+	}
+}
diff --git a/src/runtime/lock_wasip1.go b/src/runtime/lock_wasip1.go
index f883841366..55153c3a05 100644
--- a/src/runtime/lock_wasip1.go
+++ b/src/runtime/lock_wasip1.go
@@ -21,6 +21,8 @@ const (
 
 type mWaitList struct{}
 
+func lockVerifyMSize() {}
+
 func mutexContended(l *mutex) bool {
 	return false
 }
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 068f0de4fb..343e7ec592 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -814,6 +814,8 @@ func schedinit() {
 	// extremely short.
 	lockInit(&memstats.heapStats.noPLock, lockRankLeafRank)
 
+	lockVerifyMSize()
+
 	// raceinit must be the first call to race detector.
 	// In particular, it must be done before mallocinit below calls racemapshadow.
 	gp := getg()
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index b8c710a816..03798d5699 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -8,6 +8,7 @@ import (
 	"internal/abi"
 	"internal/chacha8rand"
 	"internal/goarch"
+	"internal/goexperiment"
 	"internal/runtime/atomic"
 	"internal/runtime/sys"
 	"unsafe"
@@ -619,6 +620,12 @@ type m struct {
 	// Up to 10 locks held by this m, maintained by the lock ranking code.
 	locksHeldLen int
 	locksHeld    [10]heldLockInfo
+
+	// Size the runtime.m structure so it fits in the 2048-byte size class, and
+	// not in the next-smallest (1792-byte) size class. That leaves the 11 low
+	// bits of muintptr values available for flags, as required for
+	// GOEXPERIMENT=spinbitmutex.
+	_ [goexperiment.SpinbitMutexInt * 700 * (2 - goarch.PtrSize/4)]byte
 }
 
 type p struct {
-- 
2.48.1
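
For orientation only (not part of the patch): below is a minimal, runnable Go sketch of the state-word packing that the lock_spinbit.go comments describe: flag bits live in the low bits of the word, and the waiting M's address is recoverable from the high bits because heap-allocated Ms sit in the 2048-byte size class. The example address and the 8-byte stand-in for mallocHeaderSize are made-up illustrative values.

package main

import "fmt"

// Illustrative copies of the patch's flag constants. mutexMOffset stands in
// for mallocHeaderSize and is assumed to be 8 bytes here, purely for
// demonstration.
const (
	mutexLocked      = 0x001
	mutexSleeping    = 0x002
	mutexSpinning    = 0x100
	mutexStackLocked = 0x200
	mutexMMask       = 0x3FF
	mutexMOffset     = 8
)

func main() {
	// A heap-allocated M in the 2048-byte size class starts at a 2048-byte
	// boundary plus the allocation-header offset, so its low bits are known.
	mAddr := uintptr(0xc000104800) + mutexMOffset

	// Pack: keep only the high bits of the pointer in the state word, leaving
	// the low bits free for the locked/sleeping/spinning/stack-lock flags.
	key := (mAddr &^ mutexMMask) | mutexLocked | mutexSleeping | mutexSpinning

	// Unpack (the heap-M case of mutexWaitListHead): mask off the flags and
	// re-add the known allocation-header offset.
	recovered := (key &^ mutexMMask) + mutexMOffset

	fmt.Printf("m address:  %#x\n", mAddr)
	fmt.Printf("state word: %#x\n", key)
	fmt.Printf("recovered:  %#x (match: %v)\n", recovered, recovered == mAddr)
}

This mirrors the recovery that mutexWaitListHead performs for heap-allocated Ms; m0 is handled as a special case in the patch because it is a static, non-heap allocation.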