runtime: save scalar registers off stack in amd64 async preemption

author Austin Clements <austin@google.com>

Wed, 30 Apr 2025 02:55:40 +0000 (22:55 -0400)

committer Gopher Robot <gobot@golang.org>

Tue, 5 Aug 2025 20:58:52 +0000 (13:58 -0700)
author Austin Clements <austin@google.com>
Wed, 30 Apr 2025 02:55:40 +0000 (22:55 -0400)
committer Gopher Robot <gobot@golang.org>
Tue, 5 Aug 2025 20:58:52 +0000 (13:58 -0700)
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go

index fa30efccb1efe32b0ce6494ed095554b1b24ed9d..1f55717f0a1a6019f91245b56805fa9fc8d9744f 100644 (file)
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -554,6 +554,8 @@ type G = g
  
  type Sudog = sudog
  
+type XRegPerG = xRegPerG
+
  func Getg() *G {
         return getg()
  }
diff --git a/src/runtime/lockrank.go b/src/runtime/lockrank.go

index 44015ce862d07777604f510ae500e9c38eefebe2..9821e4999899517a1eb158e7731dd929b6ce1cc3 100644 (file)
--- a/src/runtime/lockrank.go
+++ b/src/runtime/lockrank.go
@@ -70,6 +70,7 @@ const (
         lockRankHchanLeaf
         // WB
         lockRankWbufSpans
+       lockRankXRegAlloc
         lockRankMheap
         lockRankMheapSpecial
         lockRankGlobalAlloc
@@ -143,6 +144,7 @@ var lockNames = []string{
         lockRankStackLarge:          "stackLarge",
         lockRankHchanLeaf:           "hchanLeaf",
         lockRankWbufSpans:           "wbufSpans",
+       lockRankXRegAlloc:           "xRegAlloc",
         lockRankMheap:               "mheap",
         lockRankMheapSpecial:        "mheapSpecial",
         lockRankGlobalAlloc:         "globalAlloc",
@@ -228,9 +230,10 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
         lockRankStackLarge:          {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan},
         lockRankHchanLeaf:           {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankHchanLeaf},
         lockRankWbufSpans:           {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan},
+       lockRankXRegAlloc:           {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankTimerSend, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched},
         lockRankMheap:               {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans},
         lockRankMheapSpecial:        {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap},
-       lockRankGlobalAlloc:         {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap, lockRankMheapSpecial},
+       lockRankGlobalAlloc:         {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankXRegAlloc, lockRankMheap, lockRankMheapSpecial},
         lockRankTrace:               {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap},
         lockRankTraceStackTab:       {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap, lockRankTrace},
         lockRankPanic:               {},
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go

index cb0d34004899ca7a8799db9aca346f29ed455c61..1776206573892fc6c2db88d6ebc4338ca130a57f 100644 (file)
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -821,6 +821,8 @@ func (h *mheap) init() {
         }
  
         h.pages.init(&h.lock, &memstats.gcMiscSys, false)
+
+       xRegInitAlloc()
  }
  
  // reclaim sweeps and reclaims at least npage pages into the heap.
diff --git a/src/runtime/mklockrank.go b/src/runtime/mklockrank.go

index 46a063fdce569ccd4e6022fb26fceb353a93e677..9c503369a35841b45f1ab262f2ecf290fdd62f58 100644 (file)
--- a/src/runtime/mklockrank.go
+++ b/src/runtime/mklockrank.go
@@ -193,6 +193,9 @@ defer,
  # Below WB is the write barrier implementation.
  < wbufSpans;
  
+# xRegState allocator
+sched < xRegAlloc;
+
  # Span allocator
  stackLarge,
    stackpool,
@@ -205,7 +208,8 @@ stackLarge,
  # an mspanSpecial lock, and they're part of the malloc implementation.
  # Pinner bits might be freed by the span allocator.
  mheap, mspanSpecial < mheapSpecial;
-mheap, mheapSpecial < globalAlloc;
+# Fixallocs
+mheap, mheapSpecial, xRegAlloc < globalAlloc;
  
  # Execution tracer events (with a P)
  hchan,
diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go

index ec900a23d257e91415d05003b9687c062788b055..8fce40ccd8d04c96e7c6c37cf4f9368a6f0dff16 100644 (file)
--- a/src/runtime/mkpreempt.go
+++ b/src/runtime/mkpreempt.go
@@ -9,8 +9,10 @@
  package main
  
  import (
+       "bytes"
         "flag"
         "fmt"
+       "go/format"
         "io"
         "log"
         "os"
@@ -122,14 +124,19 @@ type gen struct {
         goarch string
  }
  
-func (g *gen) asmHeader() {
+func (g *gen) commonHeader() {
         fmt.Fprintf(g.w, "// Code generated by mkpreempt.go; DO NOT EDIT.\n\n")
         if beLe[g.goarch] {
                 base := g.goarch[:len(g.goarch)-1]
                 fmt.Fprintf(g.w, "//go:build %s || %sle\n\n", base, base)
         }
+}
+
+func (g *gen) asmHeader() {
+       g.commonHeader()
         fmt.Fprintf(g.w, "#include \"go_asm.h\"\n")
         if g.goarch == "amd64" {
+               fmt.Fprintf(g.w, "#include \"go_tls.h\"\n")
                 fmt.Fprintf(g.w, "#include \"asm_amd64.h\"\n")
         }
         fmt.Fprintf(g.w, "#include \"textflag.h\"\n\n")
@@ -145,6 +152,43 @@ func (g *gen) label(l string) {
         fmt.Fprintf(g.w, "%s\n", l)
  }
  
+// writeXRegs writes an architecture xregs file.
+func writeXRegs(arch string, l *layout) {
+       var code bytes.Buffer
+       g := gen{&code, arch}
+       g.commonHeader()
+       fmt.Fprintf(g.w, `
+package runtime
+
+type xRegs struct {
+`)
+       pos := 0
+       for _, reg := range l.regs {
+               if reg.pos != pos {
+                       log.Fatalf("padding not implemented")
+               }
+               typ := fmt.Sprintf("[%d]byte", reg.size)
+               switch {
+               case reg.size == 4 && reg.pos%4 == 0:
+                       typ = "uint32"
+               case reg.size == 8 && reg.pos%8 == 0:
+                       typ = "uint64"
+               }
+               fmt.Fprintf(g.w, "\t%s %s\n", reg.reg, typ)
+               pos += reg.size
+       }
+       fmt.Fprintf(g.w, "}\n")
+
+       path := fmt.Sprintf("preempt_%s.go", arch)
+       b, err := format.Source(code.Bytes())
+       if err != nil {
+               log.Fatalf("formatting %s: %s", path, err)
+       }
+       if err := os.WriteFile(path, b, 0666); err != nil {
+               log.Fatal(err)
+       }
+}
+
  type layout struct {
         stack int
         regs  []regPos
@@ -152,7 +196,7 @@ type layout struct {
  }
  
  type regPos struct {
-       pos int
+       pos, size int
  
         saveOp    string
         restoreOp string
@@ -165,17 +209,17 @@ type regPos struct {
  }
  
  func (l *layout) add(op, reg string, size int) {
-       l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack})
+       l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack, size: size})
         l.stack += size
  }
  
  func (l *layout) add2(sop, rop, reg string, size int) {
-       l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack})
+       l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack, size: size})
         l.stack += size
  }
  
  func (l *layout) addSpecial(save, restore string, size int) {
-       l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack})
+       l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack, size: size})
         l.stack += size
  }
  
@@ -239,6 +283,8 @@ func gen386(g *gen) {
  }
  
  func genAMD64(g *gen) {
+       const xReg = "AX" // *xRegState
+
         p := g.p
  
         // Assign stack offsets.
@@ -251,12 +297,13 @@ func genAMD64(g *gen) {
                         l.add("MOVQ", reg, 8)
                 }
         }
-       lSSE := layout{stack: l.stack, sp: "SP"}
+       lXRegs := layout{sp: xReg} // Non-GP registers
         for _, reg := range regNamesAMD64 {
                 if strings.HasPrefix(reg, "X") {
-                       lSSE.add("MOVUPS", reg, 16)
+                       lXRegs.add("MOVUPS", reg, 16)
                 }
         }
+       writeXRegs(g.goarch, &lXRegs)
  
         // TODO: MXCSR register?
  
@@ -265,17 +312,40 @@ func genAMD64(g *gen) {
         p("// Save flags before clobbering them")
         p("PUSHFQ")
         p("// obj doesn't understand ADD/SUB on SP, but does understand ADJSP")
-       p("ADJSP $%d", lSSE.stack)
+       p("ADJSP $%d", l.stack)
         p("// But vet doesn't know ADJSP, so suppress vet stack checking")
         p("NOP SP")
  
+       p("// Save GPs")
         l.save(g)
  
-       lSSE.save(g)
+       // In general, the limitations on asynchronous preemption mean we only
+       // preempt in ABIInternal code. However, there's at least one exception to
+       // this: when we're in an open-coded transition between an ABIInternal
+       // function and an ABI0 call. We could more carefully arrange unsafe points
+       // to avoid ever landing in ABI0, but it's easy to just make this code not
+       // sensitive to the ABI we're preempting. The CALL to asyncPreempt2 will
+       // ensure we're in ABIInternal register state.
+       p("// Save extended register state to p.xRegs.scratch")
+       p("// Don't make assumptions about ABI register state. See mkpreempt.go")
+       p("get_tls(CX)")
+       p("MOVQ g(CX), R14")
+       p("MOVQ g_m(R14), %s", xReg)
+       p("MOVQ m_p(%s), %s", xReg, xReg)
+       p("LEAQ (p_xRegs+xRegPerP_scratch)(%s), %s", xReg, xReg)
+       lXRegs.save(g)
+
         p("CALL ·asyncPreempt2(SB)")
-       lSSE.restore(g)
+
+       p("// Restore non-GPs from *p.xRegs.cache")
+       p("MOVQ g_m(R14), %s", xReg)
+       p("MOVQ m_p(%s), %s", xReg, xReg)
+       p("MOVQ (p_xRegs+xRegPerP_cache)(%s), %s", xReg, xReg)
+       lXRegs.restore(g)
+
+       p("// Restore GPs")
         l.restore(g)
-       p("ADJSP $%d", -lSSE.stack)
+       p("ADJSP $%d", -l.stack)
         p("POPFQ")
         p("POPQ BP")
         p("RET")
diff --git a/src/runtime/preempt.go b/src/runtime/preempt.go

index c41c3558359c0cb625455392a6004173292f6bab..22727df74eead25eceaefa27d0d7946b2d81d775 100644 (file)
--- a/src/runtime/preempt.go
+++ b/src/runtime/preempt.go
@@ -292,21 +292,52 @@ func canPreemptM(mp *m) bool {
  
  // asyncPreempt saves all user registers and calls asyncPreempt2.
  //
-// When stack scanning encounters an asyncPreempt frame, it scans that
+// It saves GP registers (anything that might contain a pointer) to the G stack.
+// Hence, when stack scanning encounters an asyncPreempt frame, it scans that
  // frame and its parent frame conservatively.
  //
+// On some platforms, it saves large additional scalar-only register state such
+// as vector registers to an "extended register state" on the P.
+//
  // asyncPreempt is implemented in assembly.
  func asyncPreempt()
  
+// asyncPreempt2 is the Go continuation of asyncPreempt.
+//
+// It must be deeply nosplit because there's untyped data on the stack from
+// asyncPreempt.
+//
+// It must not have any write barriers because we need to limit the amount of
+// stack it uses.
+//
  //go:nosplit
+//go:nowritebarrierrec
  func asyncPreempt2() {
+       // We can't grow the stack with untyped data from asyncPreempt, so switch to
+       // the system stack right away.
+       mcall(func(gp *g) {
+               gp.asyncSafePoint = true
+
+               // Move the extended register state from the P to the G. We do this now that
+               // we're on the system stack to avoid stack splits.
+               xRegSave(gp)
+
+               if gp.preemptStop {
+                       preemptPark(gp)
+               } else {
+                       gopreempt_m(gp)
+               }
+               // The above functions never return.
+       })
+
+       // Do not grow the stack below here!
+
         gp := getg()
-       gp.asyncSafePoint = true
-       if gp.preemptStop {
-               mcall(preemptPark)
-       } else {
-               mcall(gopreempt_m)
-       }
+
+       // Put the extended register state back on the M so resumption can find it.
+       // We can't do this in asyncPreemptM because the park calls never return.
+       xRegRestore(gp)
+
         gp.asyncSafePoint = false
  }
  
@@ -319,19 +350,13 @@ func init() {
         total := funcMaxSPDelta(f)
         f = findfunc(abi.FuncPCABIInternal(asyncPreempt2))
         total += funcMaxSPDelta(f)
+       f = findfunc(abi.FuncPCABIInternal(xRegRestore))
+       total += funcMaxSPDelta(f)
         // Add some overhead for return PCs, etc.
         asyncPreemptStack = uintptr(total) + 8*goarch.PtrSize
         if asyncPreemptStack > stackNosplit {
-               // We need more than the nosplit limit. This isn't
-               // unsafe, but it may limit asynchronous preemption.
-               //
-               // This may be a problem if we start using more
-               // registers. In that case, we should store registers
-               // in a context object. If we pre-allocate one per P,
-               // asyncPreempt can spill just a few registers to the
-               // stack, then grab its context object and spill into
-               // it. When it enters the runtime, it would allocate a
-               // new context for the P.
+               // We need more than the nosplit limit. This isn't unsafe, but it may
+               // limit asynchronous preemption. Consider moving state into xRegState.
                 print("runtime: asyncPreemptStack=", asyncPreemptStack, "\n")
                 throw("async stack too large")
         }
diff --git a/src/runtime/preempt_amd64.go b/src/runtime/preempt_amd64.go

new file mode 100644 (file)

index 0000000..4a3ced7
--- /dev/null
+++ b/src/runtime/preempt_amd64.go
@@ -0,0 +1,22 @@
+// Code generated by mkpreempt.go; DO NOT EDIT.
+
+package runtime
+
+type xRegs struct {
+       X0  [16]byte
+       X1  [16]byte
+       X2  [16]byte
+       X3  [16]byte
+       X4  [16]byte
+       X5  [16]byte
+       X6  [16]byte
+       X7  [16]byte
+       X8  [16]byte
+       X9  [16]byte
+       X10 [16]byte
+       X11 [16]byte
+       X12 [16]byte
+       X13 [16]byte
+       X14 [16]byte
+       X15 [16]byte
+}
diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s

index 8e3ed0d7c59dcee445b3f89e116ff5bef1ead8cb..0a33ce7f3e72f5c8d935edb69e3d1cdc0eb3e5e6 100644 (file)
--- a/src/runtime/preempt_amd64.s
+++ b/src/runtime/preempt_amd64.s
@@ -1,6 +1,7 @@
  // Code generated by mkpreempt.go; DO NOT EDIT.
  
  #include "go_asm.h"
+#include "go_tls.h"
  #include "asm_amd64.h"
  #include "textflag.h"
  
@@ -10,9 +11,10 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
         // Save flags before clobbering them
         PUSHFQ
         // obj doesn't understand ADD/SUB on SP, but does understand ADJSP
-       ADJSP $368
+       ADJSP $112
         // But vet doesn't know ADJSP, so suppress vet stack checking
         NOP SP
+       // Save GPs
         MOVQ AX, 0(SP)
         MOVQ CX, 8(SP)
         MOVQ DX, 16(SP)
@@ -27,39 +29,51 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
         MOVQ R13, 88(SP)
         MOVQ R14, 96(SP)
         MOVQ R15, 104(SP)
-       MOVUPS X0, 112(SP)
-       MOVUPS X1, 128(SP)
-       MOVUPS X2, 144(SP)
-       MOVUPS X3, 160(SP)
-       MOVUPS X4, 176(SP)
-       MOVUPS X5, 192(SP)
-       MOVUPS X6, 208(SP)
-       MOVUPS X7, 224(SP)
-       MOVUPS X8, 240(SP)
-       MOVUPS X9, 256(SP)
-       MOVUPS X10, 272(SP)
-       MOVUPS X11, 288(SP)
-       MOVUPS X12, 304(SP)
-       MOVUPS X13, 320(SP)
-       MOVUPS X14, 336(SP)
-       MOVUPS X15, 352(SP)
+       // Save extended register state to p.xRegs.scratch
+       // Don't make assumptions about ABI register state. See mkpreempt.go
+       get_tls(CX)
+       MOVQ g(CX), R14
+       MOVQ g_m(R14), AX
+       MOVQ m_p(AX), AX
+       LEAQ (p_xRegs+xRegPerP_scratch)(AX), AX
+       MOVUPS X0, 0(AX)
+       MOVUPS X1, 16(AX)
+       MOVUPS X2, 32(AX)
+       MOVUPS X3, 48(AX)
+       MOVUPS X4, 64(AX)
+       MOVUPS X5, 80(AX)
+       MOVUPS X6, 96(AX)
+       MOVUPS X7, 112(AX)
+       MOVUPS X8, 128(AX)
+       MOVUPS X9, 144(AX)
+       MOVUPS X10, 160(AX)
+       MOVUPS X11, 176(AX)
+       MOVUPS X12, 192(AX)
+       MOVUPS X13, 208(AX)
+       MOVUPS X14, 224(AX)
+       MOVUPS X15, 240(AX)
         CALL ·asyncPreempt2(SB)
-       MOVUPS 352(SP), X15
-       MOVUPS 336(SP), X14
-       MOVUPS 320(SP), X13
-       MOVUPS 304(SP), X12
-       MOVUPS 288(SP), X11
-       MOVUPS 272(SP), X10
-       MOVUPS 256(SP), X9
-       MOVUPS 240(SP), X8
-       MOVUPS 224(SP), X7
-       MOVUPS 208(SP), X6
-       MOVUPS 192(SP), X5
-       MOVUPS 176(SP), X4
-       MOVUPS 160(SP), X3
-       MOVUPS 144(SP), X2
-       MOVUPS 128(SP), X1
-       MOVUPS 112(SP), X0
+       // Restore non-GPs from *p.xRegs.cache
+       MOVQ g_m(R14), AX
+       MOVQ m_p(AX), AX
+       MOVQ (p_xRegs+xRegPerP_cache)(AX), AX
+       MOVUPS 240(AX), X15
+       MOVUPS 224(AX), X14
+       MOVUPS 208(AX), X13
+       MOVUPS 192(AX), X12
+       MOVUPS 176(AX), X11
+       MOVUPS 160(AX), X10
+       MOVUPS 144(AX), X9
+       MOVUPS 128(AX), X8
+       MOVUPS 112(AX), X7
+       MOVUPS 96(AX), X6
+       MOVUPS 80(AX), X5
+       MOVUPS 64(AX), X4
+       MOVUPS 48(AX), X3
+       MOVUPS 32(AX), X2
+       MOVUPS 16(AX), X1
+       MOVUPS 0(AX), X0
+       // Restore GPs
         MOVQ 104(SP), R15
         MOVQ 96(SP), R14
         MOVQ 88(SP), R13
@@ -74,7 +88,7 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
         MOVQ 16(SP), DX
         MOVQ 8(SP), CX
         MOVQ 0(SP), AX
-       ADJSP $-368
+       ADJSP $-112
         POPFQ
         POPQ BP
         RET
diff --git a/src/runtime/preempt_noxreg.go b/src/runtime/preempt_noxreg.go

new file mode 100644 (file)

index 0000000..dfe4655
--- /dev/null
+++ b/src/runtime/preempt_noxreg.go
@@ -0,0 +1,27 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64
+
+// This provides common support for architectures that DO NOT use extended
+// register state in asynchronous preemption.
+
+package runtime
+
+type xRegPerG struct{}
+
+type xRegPerP struct{}
+
+// xRegState is defined only so the build fails if we try to define a real
+// xRegState on a noxreg architecture.
+type xRegState struct{}
+
+func xRegInitAlloc() {}
+
+func xRegSave(gp *g) {}
+
+//go:nosplit
+func xRegRestore(gp *g) {}
+
+func (*xRegPerP) free() {}
diff --git a/src/runtime/preempt_xreg.go b/src/runtime/preempt_xreg.go

new file mode 100644 (file)

index 0000000..9e05455
--- /dev/null
+++ b/src/runtime/preempt_xreg.go
@@ -0,0 +1,137 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64
+
+// This provides common support for architectures that use extended register
+// state in asynchronous preemption.
+//
+// While asynchronous preemption stores general-purpose (GP) registers on the
+// preempted goroutine's own stack, extended register state can be used to save
+// non-GP state off the stack. In particular, this is meant for large vector
+// register files. Currently, we assume this contains only scalar data, though
+// we could change this constraint by conservatively scanning this memory.
+//
+// For an architecture to support extended register state, it must provide a Go
+// definition of an xRegState type for storing the state, and its asyncPreempt
+// implementation must write this register state to p.xRegs.scratch.
+
+package runtime
+
+import (
+       "internal/runtime/sys"
+       "unsafe"
+)
+
+// xRegState is long-lived extended register state. It is allocated off-heap and
+// manually managed.
+type xRegState struct {
+       _    sys.NotInHeap // Allocated from xRegAlloc
+       regs xRegs
+}
+
+// xRegPerG stores extended register state while a goroutine is asynchronously
+// preempted. This is nil otherwise, so we can reuse a (likely small) pool of
+// xRegState objects.
+type xRegPerG struct {
+       state *xRegState
+}
+
+type xRegPerP struct {
+       // scratch temporary per-P space where [asyncPreempt] saves the register
+       // state before entering Go. It's quickly copied to per-G state.
+       scratch xRegs
+
+       // cache is a 1-element allocation cache of extended register state used by
+       // asynchronous preemption. On entry to preemption, this is used as a simple
+       // allocation cache. On exit from preemption, the G's xRegState is always
+       // stored here where it can be restored, and later either freed or reused
+       // for another preemption. On exit, this serves the dual purpose of
+       // delay-freeing the allocated xRegState until after we've definitely
+       // restored it.
+       cache *xRegState
+}
+
+// xRegAlloc allocates xRegState objects.
+var xRegAlloc struct {
+       lock  mutex
+       alloc fixalloc
+}
+
+func xRegInitAlloc() {
+       lockInit(&xRegAlloc.lock, lockRankXRegAlloc)
+       xRegAlloc.alloc.init(unsafe.Sizeof(xRegState{}), nil, nil, &memstats.other_sys)
+}
+
+// xRegSave saves the extended register state on this P to gp.
+//
+// This must run on the system stack because it assumes the P won't change.
+//
+//go:systemstack
+func xRegSave(gp *g) {
+       if gp.xRegs.state != nil {
+               // Double preempt?
+               throw("gp.xRegState.p != nil on async preempt")
+       }
+
+       // Get the place to save the register state.
+       var dest *xRegState
+       pp := gp.m.p.ptr()
+       if pp.xRegs.cache != nil {
+               // Use the cached allocation.
+               dest = pp.xRegs.cache
+               pp.xRegs.cache = nil
+       } else {
+               // Allocate a new save block.
+               lock(&xRegAlloc.lock)
+               dest = (*xRegState)(xRegAlloc.alloc.alloc())
+               unlock(&xRegAlloc.lock)
+       }
+
+       // Copy state saved in the scratchpad to dest.
+       //
+       // If we ever need to save less state (e.g., avoid saving vector registers
+       // that aren't in use), we could have multiple allocation pools for
+       // different size states and copy only the registers we need.
+       dest.regs = pp.xRegs.scratch
+
+       // Save on the G.
+       gp.xRegs.state = dest
+}
+
+// xRegRestore prepares the extended register state on gp to be restored.
+//
+// It moves the state to gp.m.p.xRegs.cache where [asyncPreempt] expects to find
+// it. This means nothing else may use the cache between this call and the
+// return to asyncPreempt. This is not quite symmetric with [xRegSave], which
+// uses gp.m.p.xRegs.scratch. By using cache instead, we save a block copy.
+//
+// This is called with asyncPreempt on the stack and thus must not grow the
+// stack.
+//
+//go:nosplit
+func xRegRestore(gp *g) {
+       if gp.xRegs.state == nil {
+               throw("gp.xRegState.p == nil on return from async preempt")
+       }
+       // If the P has a block cached on it, free that so we can replace it.
+       pp := gp.m.p.ptr()
+       if pp.xRegs.cache != nil {
+               // Don't grow the G stack.
+               systemstack(func() {
+                       pp.xRegs.free()
+               })
+       }
+       pp.xRegs.cache = gp.xRegs.state
+       gp.xRegs.state = nil
+}
+
+func (xRegs *xRegPerP) free() {
+       if xRegs.cache != nil {
+               lock(&xRegAlloc.lock)
+               xRegAlloc.alloc.free(unsafe.Pointer(xRegs.cache))
+               xRegs.cache = nil
+               unlock(&xRegAlloc.lock)
+       }
+}
diff --git a/src/runtime/proc.go b/src/runtime/proc.go

index ec66384a75fa1bf8359ef1cdd2f5344473e17a73..25d39d9ba389ad695996712c6a354dfc3ccd6e1b 100644 (file)
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -5838,6 +5838,7 @@ func (pp *p) destroy() {
         pp.gcAssistTime = 0
         gcCleanups.queued += pp.cleanupsQueued
         pp.cleanupsQueued = 0
+       pp.xRegs.free()
         pp.status = _Pdead
  }
  
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go

index 29e9b8a7b999ae90f4a8d4c4575d65851d72b3f7..b5d2dcefaded996dc6173d0d9a82b141611795d5 100644 (file)
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -492,6 +492,10 @@ type g struct {
         coroarg *coro // argument during coroutine transfers
         bubble  *synctestBubble
  
+       // xRegs stores the extended register state if this G has been
+       // asynchronously preempted.
+       xRegs xRegPerG
+
         // Per-G tracer state.
         trace gTraceState
  
@@ -760,6 +764,11 @@ type p struct {
         // gcStopTime is the nanotime timestamp that this P last entered _Pgcstop.
         gcStopTime int64
  
+       // xRegs is the per-P extended register state used by asynchronous
+       // preemption. This is an empty struct on platforms that don't use extended
+       // register state.
+       xRegs xRegPerP
+
         // Padding is no longer needed. False sharing is now not a worry because p is large enough
         // that its size class is an integer multiple of the cache line size (for any of our architectures).
  }
diff --git a/src/runtime/sizeof_test.go b/src/runtime/sizeof_test.go

index a5dc8aed3443bc0ae1066004f6522e4a037b11af..de859866a5adb2cdccd80a641223939c91231e94 100644 (file)
--- a/src/runtime/sizeof_test.go
+++ b/src/runtime/sizeof_test.go
@@ -15,13 +15,18 @@ import (
  
  func TestSizeof(t *testing.T) {
         const _64bit = unsafe.Sizeof(uintptr(0)) == 8
+       const xreg = unsafe.Sizeof(runtime.XRegPerG{}) // Varies per architecture
         var tests = []struct {
                 val    any     // type as a value
                 _32bit uintptr // size on 32bit platforms
                 _64bit uintptr // size on 64bit platforms
         }{
-               {runtime.G{}, 280, 440},   // g, but exported for testing
-               {runtime.Sudog{}, 56, 88}, // sudog, but exported for testing
+               {runtime.G{}, 280 + xreg, 440 + xreg}, // g, but exported for testing
+               {runtime.Sudog{}, 56, 88},             // sudog, but exported for testing
+       }
+
+       if xreg > runtime.PtrSize {
+               t.Errorf("unsafe.Sizeof(xRegPerG) = %d, want <= %d", xreg, runtime.PtrSize)
         }
  
         for _, tt := range tests {
author	Austin Clements <austin@google.com>
	Wed, 30 Apr 2025 02:55:40 +0000 (22:55 -0400)
committer	Gopher Robot <gobot@golang.org>
	Tue, 5 Aug 2025 20:58:52 +0000 (13:58 -0700)
src/runtime/export_test.go		patch \| blob \| history
src/runtime/lockrank.go		patch \| blob \| history
src/runtime/mheap.go		patch \| blob \| history
src/runtime/mklockrank.go		patch \| blob \| history
src/runtime/mkpreempt.go		patch \| blob \| history
src/runtime/preempt.go		patch \| blob \| history
src/runtime/preempt_amd64.go	[new file with mode: 0644]	patch \| blob
src/runtime/preempt_amd64.s		patch \| blob \| history
src/runtime/preempt_noxreg.go	[new file with mode: 0644]	patch \| blob
src/runtime/preempt_xreg.go	[new file with mode: 0644]	patch \| blob
src/runtime/proc.go		patch \| blob \| history
src/runtime/runtime2.go		patch \| blob \| history
src/runtime/sizeof_test.go		patch \| blob \| history