--- /dev/null
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || plan9 || solaris || windows) && goexperiment.spinbitmutex
+
+package runtime
+
+import (
+ "internal/goarch"
+ "internal/runtime/atomic"
+ "unsafe"
+)
+
+// This implementation depends on OS-specific implementations of
+//
+// func semacreate(mp *m)
+// Create a semaphore for mp, if it does not already have one.
+//
+// func semasleep(ns int64) int32
+// If ns < 0, acquire m's semaphore and return 0.
+// If ns >= 0, try to acquire m's semaphore for at most ns nanoseconds.
+// Return 0 if the semaphore was acquired, -1 if interrupted or timed out.
+//
+// func semawakeup(mp *m)
+// Wake up mp, which is or will soon be sleeping on its semaphore.
+
+// The mutex state consists of four flags and a pointer. The flag at bit 0,
+// mutexLocked, represents the lock itself. Bit 1, mutexSleeping, is a hint that
+// the pointer is non-nil. The fast paths for locking and unlocking the mutex
+// are based on atomic 8-bit swap operations on the low byte; bits 2 through 7
+// are unused.
+//
+// Bit 8, mutexSpinning, is a try-lock that grants a waiting M permission to
+// spin on the state word. Most other Ms must attempt to spend their time
+// sleeping to reduce traffic on the cache line. This is the "spin bit" for
+// which the implementation is named. (The anti-starvation mechanism also grants
+// temporary permission for an M to spin.)
+//
+// Bit 9, mutexStackLocked, is a try-lock that grants an unlocking M permission
+// to inspect the list of waiting Ms and to pop an M off of that stack.
+//
+// The upper bits hold a (partial) pointer to the M that most recently went to
+// sleep. The sleeping Ms form a stack linked by their mWaitList.next fields.
+// Because the fast paths use an 8-bit swap on the low byte of the state word,
+// we'll need to reconstruct the full M pointer from the bits we have. Most Ms
+// are allocated on the heap, and have a known alignment and base offset. (The
+// offset is due to mallocgc's allocation headers.) The main program thread uses
+// a static M value, m0. We check for m0 specifically and add a known offset
+// otherwise.
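+//
+// As an illustration (the addresses are made up, not taken from a real run):
+// suppose the most recent sleeper is a heap-allocated M whose allocation
+// begins at the 2048-byte-aligned address 0x00c000080000, so the M itself
+// starts mutexMOffset bytes later (8 here) at 0x00c000080008. With the lock
+// held, that M asleep, and another M spinning, the state word would be
+//
+//	0x00c000080000 | mutexSpinning | mutexSleeping | mutexLocked // 0x00c000080103
+//
+// Clearing the flag bits with &^mutexMMask recovers 0x00c000080000, and
+// adding mutexMOffset back yields the sleeping M's address, 0x00c000080008.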
+
+const (
+ active_spin = 4 // referenced in proc.go for sync.Mutex implementation
+ active_spin_cnt = 30 // referenced in proc.go for sync.Mutex implementation
+)
+
+const (
+ mutexLocked = 0x001
+ mutexSleeping = 0x002
+ mutexSpinning = 0x100
+ mutexStackLocked = 0x200
+ mutexMMask = 0x3FF
+ mutexMOffset = mallocHeaderSize // alignment of heap-allocated Ms (those other than m0)
+
+ mutexActiveSpinCount = 4
+ mutexActiveSpinSize = 30
+ mutexPassiveSpinCount = 1
+
+ mutexTailWakePeriod = 16
+)
+
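+// key8 returns a pointer to the byte of *p that holds the low 8 bits of the
+// mutex state (the mutexLocked and mutexSleeping flags), regardless of the
+// machine's byte order.
+//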
+//go:nosplit
+func key8(p *uintptr) *uint8 {
+ if goarch.BigEndian {
+ return &(*[8]uint8)(unsafe.Pointer(p))[goarch.PtrSize/1-1]
+ }
+ return &(*[8]uint8)(unsafe.Pointer(p))[0]
+}
+
+// mWaitList is part of the M struct, and holds the list of Ms that are waiting
+// for a particular runtime.mutex.
+//
+// When an M is unable to immediately obtain a lock, it adds itself to the list
+// of Ms waiting for the lock. It does that via this struct's next field,
+// forming a singly-linked list with the mutex's key field pointing to the head
+// of the list.
+type mWaitList struct {
+ next muintptr // next m waiting for lock
+}
+
+// lockVerifyMSize confirms that we can recreate the low bits of the M pointer.
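+// As an example (the sizes are illustrative, not measured): an M whose
+// allocation, including mallocgc's header, rounds up to 2048 bytes gives
+// size&mutexMMask == 0 and passes; one that rounded up to 1536 bytes would
+// give 0x200 and hit the throw below.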
+func lockVerifyMSize() {
+ size := roundupsize(unsafe.Sizeof(m{}), false) + mallocHeaderSize
+ if size&mutexMMask != 0 {
+ print("M structure uses sizeclass ", size, "/", hex(size), " bytes; ",
+ "incompatible with mutex flag mask ", hex(mutexMMask), "\n")
+ throw("runtime.m memory alignment too small for spinbit mutex")
+ }
+}
+
+// mutexWaitListHead recovers a full muintptr that was missing its low bits.
+// With the exception of the static m0 value, it requires allocating runtime.m
+// values in a size class with a particular minimum alignment. The 2048-byte
+// size class allows recovering the full muintptr value even after overwriting
+// the low 11 bits with flags. We can use those 11 bits as 3 flags and an
+// atomically-swapped byte.
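+//
+// Note that m0 lives in static data rather than on the heap: it has no
+// mallocgc header and no size-class alignment guarantee, which is why it is
+// matched explicitly below instead of taking the +mutexMOffset path.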
+//
+//go:nosplit
+func mutexWaitListHead(v uintptr) muintptr {
+ if highBits := v &^ mutexMMask; highBits == 0 {
+ return 0
+ } else if m0bits := muintptr(unsafe.Pointer(&m0)); highBits == uintptr(m0bits)&^mutexMMask {
+ return m0bits
+ } else {
+ return muintptr(highBits + mutexMOffset)
+ }
+}
+
+// mutexPreferLowLatency reports if this mutex prefers low latency at the risk
+// of performance collapse. If so, we can allow all waiting threads to spin on
+// the state word rather than go to sleep.
+//
+// TODO: We could have the waiting Ms each spin on their own private cache line,
+// especially if we can put a bound on the on-CPU time that would consume.
+//
+// TODO: If there's a small set of mutex values with special requirements, they
+// could make use of a more specialized lock2/unlock2 implementation. Otherwise,
+// we're constrained to what we can fit within a single uintptr with no
+// additional storage on the M for each lock held.
+//
+//go:nosplit
+func mutexPreferLowLatency(l *mutex) bool {
+ switch l {
+ default:
+ return false
+ case &sched.lock:
+ // We often expect sched.lock to pass quickly between Ms in a way that
+ // each M has unique work to do: for instance when we stop-the-world
+ // (bringing each P to idle) or add new netpoller-triggered work to the
+ // global run queue.
+ return true
+ }
+}
+
+func mutexContended(l *mutex) bool {
+ return atomic.Loaduintptr(&l.key) > mutexLocked
+}
+
+func lock(l *mutex) {
+ lockWithRank(l, getLockRank(l))
+}
+
+func lock2(l *mutex) {
+ gp := getg()
+ if gp.m.locks < 0 {
+ throw("runtime·lock: lock count")
+ }
+ gp.m.locks++
+
+ k8 := key8(&l.key)
+
+ // Speculative grab for lock.
+ v8 := atomic.Xchg8(k8, mutexLocked)
+ if v8&mutexLocked == 0 {
+ if v8&mutexSleeping != 0 {
+ atomic.Or8(k8, mutexSleeping)
+ }
+ return
+ }
+ semacreate(gp.m)
+
+ timer := &lockTimer{lock: l}
+ timer.begin()
+ // On uniprocessors, no point spinning.
+ // On multiprocessors, spin for mutexActiveSpinCount attempts.
+ spin := 0
+ if ncpu > 1 {
+ spin = mutexActiveSpinCount
+ }
+
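+ // weSpin tracks whether this M holds the mutexSpinning try-lock; atTail
+ // tracks whether its most recent sleep left it at the bottom of the wait
+ // stack (at risk of starving), which also earns permission to keep
+ // spinning. Each pass through the tryAcquire loop below either takes the
+ // lock, claims the spin bit, spins or yields while permitted, or pushes
+ // this M onto the sleep stack and parks on its semaphore.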
+ var weSpin, atTail bool
+ v := atomic.Loaduintptr(&l.key)
+tryAcquire:
+ for i := 0; ; i++ {
+ for v&mutexLocked == 0 {
+ if weSpin {
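+ // Take the lock and retire our mutexSpinning claim in one CAS: keep
+ // the waiter pointer and the remaining flag bits, set mutexLocked, and
+ // make sure mutexSleeping is set whenever waiters remain on the stack.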
+ next := (v &^ mutexMMask) | (v & (mutexMMask &^ mutexSpinning)) | mutexLocked
+ if next&^mutexMMask != 0 {
+ next |= mutexSleeping
+ }
+ if atomic.Casuintptr(&l.key, v, next) {
+ timer.end()
+ return
+ }
+ } else {
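+ // We're not the spinning M, so take the fast 8-bit path. The swap
+ // cannot see whether other Ms remain asleep, so it sets the
+ // mutexSleeping hint unconditionally rather than risk a lost wakeup.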
+ prev8 := atomic.Xchg8(k8, mutexLocked|mutexSleeping)
+ if prev8&mutexLocked == 0 {
+ timer.end()
+ return
+ }
+ }
+ v = atomic.Loaduintptr(&l.key)
+ }
+
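+ // The lock is still held. Try to claim the spin bit so this M may keep
+ // polling the state word instead of going to sleep.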
+ if !weSpin && v&mutexSpinning == 0 && atomic.Casuintptr(&l.key, v, v|mutexSpinning) {
+ v |= mutexSpinning
+ weSpin = true
+ }
+
+ if weSpin || atTail || mutexPreferLowLatency(l) {
+ if i < spin {
+ procyield(mutexActiveSpinSize)
+ v = atomic.Loaduintptr(&l.key)
+ continue tryAcquire
+ } else if i < spin+mutexPassiveSpinCount {
+ osyield() // TODO: Consider removing this step. See https://go.dev/issue/69268
+ v = atomic.Loaduintptr(&l.key)
+ continue tryAcquire
+ }
+ }
+
+ // Go to sleep
+ for v&mutexLocked != 0 {
+ // Store the current head of the list of sleeping Ms in our gp.m.mWaitList.next field
+ gp.m.mWaitList.next = mutexWaitListHead(v)
+
+ // Pack a (partial) pointer to this M with the current lock state bits
+ next := (uintptr(unsafe.Pointer(gp.m)) &^ mutexMMask) | v&mutexMMask | mutexSleeping
+ if weSpin { // If we were spinning, prepare to retire
+ next = next &^ mutexSpinning
+ }
+
+ if atomic.Casuintptr(&l.key, v, next) {
+ weSpin = false
+ // We've pushed ourselves onto the stack of waiters. Wait.
+ semasleep(-1)
+ atTail = gp.m.mWaitList.next == 0 // we were at risk of starving
+ gp.m.mWaitList.next = 0
+ i = 0
+ v = atomic.Loaduintptr(&l.key)
+ continue tryAcquire
+ }
+ v = atomic.Loaduintptr(&l.key)
+ }
+ }
+}
+
+func unlock(l *mutex) {
+ unlockWithRank(l)
+}
+
+// We might not be holding a p in this code.
+//
+//go:nowritebarrier
+func unlock2(l *mutex) {
+ gp := getg()
+
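+ // Release the lock by clearing the low byte. The previous value shows
+ // whether the lock was actually held and whether any M may be asleep
+ // and in need of a wakeup.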
+ prev8 := atomic.Xchg8(key8(&l.key), 0)
+ if prev8&mutexLocked == 0 {
+ throw("unlock of unlocked lock")
+ }
+
+ if prev8&mutexSleeping != 0 {
+ unlock2Wake(l)
+ }
+
+ gp.m.mLockProfile.recordUnlock(l)
+ gp.m.locks--
+ if gp.m.locks < 0 {
+ throw("runtime·unlock: lock count")
+ }
+ if gp.m.locks == 0 && gp.preempt { // restore the preemption request in case we've cleared it in newstack
+ gp.stackguard0 = stackPreempt
+ }
+}
+
+// unlock2Wake updates the list of Ms waiting on l, waking an M if necessary.
+//
+//go:nowritebarrier
+func unlock2Wake(l *mutex) {
+ v := atomic.Loaduintptr(&l.key)
+
+ // On occasion, seek out and wake the M at the bottom of the stack so it
+ // doesn't starve.
+ antiStarve := cheaprandn(mutexTailWakePeriod) == 0
+ if !(antiStarve || // avoiding starvation may require a wake
+ v&mutexSpinning == 0 || // no spinners means we must wake
+ mutexPreferLowLatency(l)) { // prefer waiters be awake as much as possible
+ return
+ }
+
+ for {
+ if v&^mutexMMask == 0 || v&mutexStackLocked != 0 {
+ // No waiting Ms means nothing to do.
+ //
+ // If the stack lock is unavailable, its owner would make the same
+ // wake decisions that we would, so there's nothing for us to do.
+ //
+ // Although: This thread may have a different call stack, which
+ // would result in a different entry in the mutex contention profile
+ // (upon completion of go.dev/issue/66999). That could lead to weird
+ // results if a slow critical section ends but another thread
+ // quickly takes the lock, finishes its own critical section,
+ // releases the lock, and then grabs the stack lock. That quick
+ // thread would then take credit (blame) for the delay that this
+ // slow thread caused. The alternative is to have more expensive
+ // atomic operations (a CAS) on the critical path of unlock2.
+ return
+ }
+ // Other Ms are waiting for the lock.
+ // Obtain the stack lock, and pop off an M.
+ next := v | mutexStackLocked
+ if atomic.Casuintptr(&l.key, v, next) {
+ break
+ }
+ v = atomic.Loaduintptr(&l.key)
+ }
+
+ // We own the mutexStackLocked flag. New Ms may push themselves onto the
+ // stack concurrently, but we're now the only thread that can remove or
+ // modify the Ms that are sleeping in the list.
+
+ var committed *m // If we choose an M within the stack, we've made a promise to wake it
+ for {
+ headM := v &^ mutexMMask
+ flags := v & (mutexMMask &^ mutexStackLocked) // preserve low bits, but release stack lock
+
+ mp := mutexWaitListHead(v).ptr()
+ wakem := committed
+ if committed == nil {
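+ // No M has been promised a wakeup yet. Wake the most recent sleeper
+ // only if nobody is spinning (someone must be awake to take the lock)
+ // or if this lock prefers low latency.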
+ if v&mutexSpinning == 0 || mutexPreferLowLatency(l) {
+ wakem = mp
+ }
+ if antiStarve {
+ // Wake the M at the bottom of the stack of waiters. (This is
+ // O(N) with the number of waiters.)
+ wakem = mp
+ prev := mp
+ for {
+ next := wakem.mWaitList.next.ptr()
+ if next == nil {
+ break
+ }
+ prev, wakem = wakem, next
+ }
+ if wakem != mp {
+ prev.mWaitList.next = wakem.mWaitList.next
+ committed = wakem
+ }
+ }
+ }
+
+ if wakem == mp {
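+ // We're popping the head of the stack: whatever it points to next
+ // (possibly nothing) becomes the new head stored in the state word.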
+ headM = uintptr(mp.mWaitList.next) &^ mutexMMask
+ }
+
+ next := headM | flags
+ if atomic.Casuintptr(&l.key, v, next) {
+ if wakem != nil {
+ // Claimed an M. Wake it.
+ semawakeup(wakem)
+ }
+ break
+ }
+
+ v = atomic.Loaduintptr(&l.key)
+ }
+}