runtime: use backoff and ISB instruction to reduce contention in (*lfstack).pop and...
author fanzha02 <fannie.zhang@arm.com>
Tue, 14 Jan 2025 09:32:56 +0000 (09:32 +0000)
committer Michael Knyszek <mknyszek@google.com>
Thu, 23 Oct 2025 00:02:28 +0000 (17:02 -0700)
When profiling CPU usage of LiveKit on AArch64/x86 (AWS), the graphs
show CPU spikes that repeat in a semi-periodic manner, and the spikes
occur when the GC (garbage collector) is active.

Our analysis found that the getempty function accounted for 10.54% of
the overhead, which was mainly caused by the work.empty.pop() function.
A listing of pop shows that the majority of that time, a 10.29%
overhead, is spent on atomic.Cas64((*uint64)(head), old, next).

This patch adds a backoff approach to reduce the high overhead of the
atomic operation, which primarily occurs when contention on a specific
memory address increases, typically as the number of threads grows.

Note that on platforms other than arm64, the initial value of backoff is zero.
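
The shape of the change in lfstack.pop and spanSet.pop is a standard
CAS retry loop with multiplicative backoff. A minimal sketch in plain
Go (sync/atomic instead of the runtime's internal atomic package, and a
trivial spin loop standing in for procyield; the names and the popped
value computation are illustrative only, not the runtime code):

	package sketch

	import "sync/atomic"

	var head uint64

	// spin is a stand-in for the runtime's procyield.
	func spin(n uint32) {
		for i := uint32(0); i < n; i++ {
		}
	}

	func popLike() uint64 {
		// Start at 128 on arm64; the patch leaves it at zero elsewhere.
		var backoff uint32 = 128
		for {
			old := atomic.LoadUint64(&head)
			next := old - 1 // stand-in for computing the new head
			if atomic.CompareAndSwapUint64(&head, old, next) {
				return next
			}
			// Failed CAS: back off so competing threads can finish
			// their updates, then retry with a longer delay.
			spin(backoff)
			backoff += backoff / 2
		}
	}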

This patch also rewrites the implementation of procyield() on arm64 as
an Armv8.0-A compatible delay function that uses the counter-timer.
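
For reference, the new procyield converts the requested delay from
nanoseconds to counter-timer ticks without a division: instead of
delay*CNTFRQ_EL0/1e9 it computes (delay + delay/16)*CNTFRQ_EL0 >> 30,
and since (1 + 1/16)/2^30 ≈ 0.99e-9 the result stays within a couple
of percent of the exact value. A small Go sketch of that arithmetic
(the 1 GHz frequency is only an example; the real code reads
CNTFRQ_EL0):

	package main

	import "fmt"

	func main() {
		const freq = 1_000_000_000 // example counter-timer frequency: 1 GHz
		for _, delay := range []uint64{50, 500, 5000} { // delay in ns
			exact := delay * freq / 1_000_000_000
			approx := (delay + delay/16) * freq >> 30
			fmt.Println(delay, exact, approx) // approx within ~2% of exact
		}
	}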

The garbage collector benchmark:

                           │    master       │               opt                        │
                           │   sec/op        │        sec/op     vs base                │
Garbage/benchmem-MB=64-160   3.782m ± 4%        2.264m ± 2%      -40.12% (p=0.000 n=10)
                           │ user+sys-sec/op │ user+sys-sec/op   vs base                │
Garbage/benchmem-MB=64-160   433.5m ± 4%        255.4m ± 2%      -41.08% (p=0.000 n=10)

Reference for the backoff mechanism:
https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/multi-threaded-applications-arm

Change-Id: Ie8128a2243ceacbb82ab2a88941acbb8428bad94
Reviewed-on: https://go-review.googlesource.com/c/go/+/654895
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
src/runtime/asm_arm64.s
src/runtime/lfstack.go
src/runtime/mspanset.go

index 8bbb6b8a8786d88af53354b8bdd8f3df35d195a6..902a7066aaa113e81949a30d477ba3b083644fd4 100644 (file)
@@ -1036,13 +1036,60 @@ aesloop:
        VMOV    V0.D[0], R0
        RET
 
+// The Arm architecture provides a user space accessible counter-timer which
+// is incremented at a fixed but machine-specific rate. Software can (spin)
+// wait until the counter-timer reaches some desired value.
+//
+// Armv8.7-A introduced the WFET (FEAT_WFxT) instruction, which allows the
+// processor to enter a low power state for a set time, or until an event is
+// received.
+//
+// However, WFET is not used here because it is only available on newer hardware,
+// and we aim to maintain compatibility with older Armv8-A platforms that do not
+// support this feature.
+//
+// As a fallback, we can instead use the ISB instruction to decrease processor
+// activity and thus power consumption between checks of the counter-timer.
+// Note that we do not depend on the latency of the ISB instruction which is
+// implementation specific. Actual delay comes from comparing against a fresh
+// read of the counter-timer value.
+//
+// Read more in this Arm blog post:
+// https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/multi-threaded-applications-arm
+
 TEXT runtime·procyieldAsm(SB),NOSPLIT,$0-0
        MOVWU   cycles+0(FP), R0
-       CBZ     R0, done
-again:
-       YIELD
-       SUBW    $1, R0
-       CBNZ    R0, again
+       CBZ     R0, done
+       // Prevent speculation of subsequent counter/timer reads and memory accesses.
+       ISB     $15
+       // If the delay is very short, just return.
+       // Hardcode 18ns as the first ISB delay.
+       CMP     $18, R0
+       BLS     done
+       // Adjust for overhead of initial ISB.
+       SUB     $18, R0, R0
+       // Convert the delay from nanoseconds to counter/timer ticks.
+       // Read the counter/timer frequency.
+       // delay_ticks = (delay * CNTFRQ_EL0) / 1e9
+       // With the below simplifications and adjustments,
+       // we are usually within 2% of the correct value:
+       // delay_ticks = (delay + delay / 16) * CNTFRQ_EL0 >> 30
+       MRS     CNTFRQ_EL0, R1
+       ADD     R0>>4, R0, R0
+       MUL     R1, R0, R0
+       LSR     $30, R0, R0
+       CBZ     R0, done
+       // start = current counter/timer value
+       MRS     CNTVCT_EL0, R2
+delay:
+       // Delay using ISB for all ticks.
+       ISB     $15
+       // Subtract and compare to handle counter roll-over.
+       // counter_read() - start < delay_ticks
+       MRS     CNTVCT_EL0, R1
+       SUB     R2, R1, R1
+       CMP     R0, R1
+       BCC     delay
 done:
        RET
 
index 8946c80348558cccdfa36e16051833cb44727b3e..1e2f5a296544f8ef5b105c70ddae348a2fbd42a2 100644 (file)
@@ -34,6 +34,11 @@ func (head *lfstack) push(node *lfnode) {
 }
 
 func (head *lfstack) pop() unsafe.Pointer {
+       var backoff uint32
+       // TODO: tweak backoff parameters on other architectures.
+       if GOARCH == "arm64" {
+               backoff = 128
+       }
        for {
                old := atomic.Load64((*uint64)(head))
                if old == 0 {
@@ -44,6 +49,16 @@ func (head *lfstack) pop() unsafe.Pointer {
                if atomic.Cas64((*uint64)(head), old, next) {
                        return unsafe.Pointer(node)
                }
+
+               // Use a backoff approach to reduce demand on the shared memory
+               // location; this decreases memory contention and allows other
+               // threads to make quicker progress.
+               // Read more in this Arm blog post:
+               // https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/multi-threaded-applications-arm
+               procyield(backoff)
+               // Increase backoff time.
+               backoff += backoff / 2
+
        }
 }
 
index f0fb06286202f9b83719f11f7b6af5b8289edd1a..68d2dd0d1eee2fe640efe56945d8004985211c62 100644 (file)
@@ -149,6 +149,11 @@ retry:
 // pop is safe to call concurrently with other pop and push operations.
 func (b *spanSet) pop() *mspan {
        var head, tail uint32
+       var backoff uint32
+       // TODO: tweak backoff parameters on other architectures.
+       if GOARCH == "arm64" {
+               backoff = 128
+       }
 claimLoop:
        for {
                headtail := b.index.load()
@@ -177,6 +182,14 @@ claimLoop:
                        if b.index.cas(headtail, makeHeadTailIndex(want+1, tail)) {
                                break claimLoop
                        }
+                       // Use a backoff approach to reduce demand on the shared memory
+                       // location; this decreases memory contention and allows other
+                       // threads to make quicker progress.
+                       // Read more in this Arm blog post:
+                       // https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/multi-threaded-applications-arm
+                       procyield(backoff)
+                       // Increase backoff time.
+                       backoff += backoff / 2
                        headtail = b.index.load()
                        head, tail = headtail.split()
                }