type Sudog = sudog
+type XRegPerG = xRegPerG
+
func Getg() *G {
return getg()
}
lockRankHchanLeaf
// WB
lockRankWbufSpans
+ lockRankXRegAlloc
lockRankMheap
lockRankMheapSpecial
lockRankGlobalAlloc
lockRankStackLarge: "stackLarge",
lockRankHchanLeaf: "hchanLeaf",
lockRankWbufSpans: "wbufSpans",
+ lockRankXRegAlloc: "xRegAlloc",
lockRankMheap: "mheap",
lockRankMheapSpecial: "mheapSpecial",
lockRankGlobalAlloc: "globalAlloc",
lockRankStackLarge: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan},
lockRankHchanLeaf: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankHchanLeaf},
lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan},
+ lockRankXRegAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankTimerSend, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched},
lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans},
lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap},
- lockRankGlobalAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap, lockRankMheapSpecial},
+ lockRankGlobalAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankXRegAlloc, lockRankMheap, lockRankMheapSpecial},
lockRankTrace: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap},
lockRankTraceStackTab: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap, lockRankTrace},
lockRankPanic: {},
}
h.pages.init(&h.lock, &memstats.gcMiscSys, false)
+
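+	// Set up the allocator for extended register state used by asynchronous
+	// preemption (see preempt_xreg.go).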
+ xRegInitAlloc()
}
// reclaim sweeps and reclaims at least npage pages into the heap.
# Below WB is the write barrier implementation.
< wbufSpans;
+# xRegState allocator
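+# xRegAlloc protects the fixalloc of xRegState save blocks (see preempt_xreg.go).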
+sched < xRegAlloc;
+
# Span allocator
stackLarge,
stackpool,
# an mspanSpecial lock, and they're part of the malloc implementation.
# Pinner bits might be freed by the span allocator.
mheap, mspanSpecial < mheapSpecial;
-mheap, mheapSpecial < globalAlloc;
+# Fixallocs
+mheap, mheapSpecial, xRegAlloc < globalAlloc;
# Execution tracer events (with a P)
hchan,
package main
import (
+ "bytes"
"flag"
"fmt"
+ "go/format"
"io"
"log"
"os"
goarch string
}
-func (g *gen) asmHeader() {
+func (g *gen) commonHeader() {
fmt.Fprintf(g.w, "// Code generated by mkpreempt.go; DO NOT EDIT.\n\n")
if beLe[g.goarch] {
base := g.goarch[:len(g.goarch)-1]
fmt.Fprintf(g.w, "//go:build %s || %sle\n\n", base, base)
}
+}
+
+func (g *gen) asmHeader() {
+ g.commonHeader()
fmt.Fprintf(g.w, "#include \"go_asm.h\"\n")
if g.goarch == "amd64" {
+ fmt.Fprintf(g.w, "#include \"go_tls.h\"\n")
fmt.Fprintf(g.w, "#include \"asm_amd64.h\"\n")
}
fmt.Fprintf(g.w, "#include \"textflag.h\"\n\n")
fmt.Fprintf(g.w, "%s\n", l)
}
+// writeXRegs writes an architecture xregs file.
+func writeXRegs(arch string, l *layout) {
+ var code bytes.Buffer
+ g := gen{&code, arch}
+ g.commonHeader()
+ fmt.Fprintf(g.w, `
+package runtime
+
+type xRegs struct {
+`)
+ pos := 0
+ for _, reg := range l.regs {
+ if reg.pos != pos {
+ log.Fatalf("padding not implemented")
+ }
+ typ := fmt.Sprintf("[%d]byte", reg.size)
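+		// Prefer a natural integer type when the slot is exactly 4 or 8 bytes
+		// and its offset keeps it aligned; otherwise keep the byte array.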
+ switch {
+ case reg.size == 4 && reg.pos%4 == 0:
+ typ = "uint32"
+ case reg.size == 8 && reg.pos%8 == 0:
+ typ = "uint64"
+ }
+ fmt.Fprintf(g.w, "\t%s %s\n", reg.reg, typ)
+ pos += reg.size
+ }
+ fmt.Fprintf(g.w, "}\n")
+
+ path := fmt.Sprintf("preempt_%s.go", arch)
+ b, err := format.Source(code.Bytes())
+ if err != nil {
+ log.Fatalf("formatting %s: %s", path, err)
+ }
+ if err := os.WriteFile(path, b, 0666); err != nil {
+ log.Fatal(err)
+ }
+}
+
type layout struct {
stack int
regs []regPos
}
type regPos struct {
- pos int
+ pos, size int
saveOp string
restoreOp string
}
func (l *layout) add(op, reg string, size int) {
- l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack})
+ l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack, size: size})
l.stack += size
}
func (l *layout) add2(sop, rop, reg string, size int) {
- l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack})
+ l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack, size: size})
l.stack += size
}
func (l *layout) addSpecial(save, restore string, size int) {
- l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack})
+ l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack, size: size})
l.stack += size
}
}
func genAMD64(g *gen) {
+ const xReg = "AX" // *xRegState
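+	// xReg is clobbered to address the extended register save area; it is
+	// saved and restored with the other GPs on the stack.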
+
p := g.p
// Assign stack offsets.
l.add("MOVQ", reg, 8)
}
}
- lSSE := layout{stack: l.stack, sp: "SP"}
+ lXRegs := layout{sp: xReg} // Non-GP registers
for _, reg := range regNamesAMD64 {
if strings.HasPrefix(reg, "X") {
- lSSE.add("MOVUPS", reg, 16)
+ lXRegs.add("MOVUPS", reg, 16)
}
}
+ writeXRegs(g.goarch, &lXRegs)
// TODO: MXCSR register?
p("// Save flags before clobbering them")
p("PUSHFQ")
p("// obj doesn't understand ADD/SUB on SP, but does understand ADJSP")
- p("ADJSP $%d", lSSE.stack)
+ p("ADJSP $%d", l.stack)
p("// But vet doesn't know ADJSP, so suppress vet stack checking")
p("NOP SP")
+ p("// Save GPs")
l.save(g)
- lSSE.save(g)
+ // In general, the limitations on asynchronous preemption mean we only
+ // preempt in ABIInternal code. However, there's at least one exception to
+ // this: when we're in an open-coded transition between an ABIInternal
+ // function and an ABI0 call. We could more carefully arrange unsafe points
+ // to avoid ever landing in ABI0, but it's easy to just make this code not
+ // sensitive to the ABI we're preempting. The CALL to asyncPreempt2 will
+ // ensure we're in ABIInternal register state.
+ p("// Save extended register state to p.xRegs.scratch")
+ p("// Don't make assumptions about ABI register state. See mkpreempt.go")
+ p("get_tls(CX)")
+ p("MOVQ g(CX), R14")
+ p("MOVQ g_m(R14), %s", xReg)
+ p("MOVQ m_p(%s), %s", xReg, xReg)
+ p("LEAQ (p_xRegs+xRegPerP_scratch)(%s), %s", xReg, xReg)
+ lXRegs.save(g)
+
p("CALL ·asyncPreempt2(SB)")
- lSSE.restore(g)
+
+ p("// Restore non-GPs from *p.xRegs.cache")
+ p("MOVQ g_m(R14), %s", xReg)
+ p("MOVQ m_p(%s), %s", xReg, xReg)
+ p("MOVQ (p_xRegs+xRegPerP_cache)(%s), %s", xReg, xReg)
+ lXRegs.restore(g)
+
+ p("// Restore GPs")
l.restore(g)
- p("ADJSP $%d", -lSSE.stack)
+ p("ADJSP $%d", -l.stack)
p("POPFQ")
p("POPQ BP")
p("RET")
// asyncPreempt saves all user registers and calls asyncPreempt2.
//
-// When stack scanning encounters an asyncPreempt frame, it scans that
+// It saves GP registers (anything that might contain a pointer) to the G stack.
+// Hence, when stack scanning encounters an asyncPreempt frame, it scans that
// frame and its parent frame conservatively.
//
+// On some platforms, it saves large additional scalar-only register state such
+// as vector registers to an "extended register state" on the P.
+//
// asyncPreempt is implemented in assembly.
func asyncPreempt()
+// asyncPreempt2 is the Go continuation of asyncPreempt.
+//
+// It must be deeply nosplit because there's untyped data on the stack from
+// asyncPreempt.
+//
+// It must not have any write barriers because we need to limit the amount of
+// stack it uses.
+//
//go:nosplit
+//go:nowritebarrierrec
func asyncPreempt2() {
+ // We can't grow the stack with untyped data from asyncPreempt, so switch to
+ // the system stack right away.
+ mcall(func(gp *g) {
+ gp.asyncSafePoint = true
+
+ // Move the extended register state from the P to the G. We do this now that
+ // we're on the system stack to avoid stack splits.
+ xRegSave(gp)
+
+ if gp.preemptStop {
+ preemptPark(gp)
+ } else {
+ gopreempt_m(gp)
+ }
+ // The above functions never return.
+ })
+
+ // Do not grow the stack below here!
+
gp := getg()
- gp.asyncSafePoint = true
- if gp.preemptStop {
- mcall(preemptPark)
- } else {
- mcall(gopreempt_m)
- }
+
+	// Put the extended register state back on the P so resumption can find it.
+	// We can't do this in the mcall above because the park calls never return.
+ xRegRestore(gp)
+
gp.asyncSafePoint = false
}
total := funcMaxSPDelta(f)
f = findfunc(abi.FuncPCABIInternal(asyncPreempt2))
total += funcMaxSPDelta(f)
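+	// xRegRestore is nosplit and runs below asyncPreempt2, so account for its
+	// frame as well.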
+ f = findfunc(abi.FuncPCABIInternal(xRegRestore))
+ total += funcMaxSPDelta(f)
// Add some overhead for return PCs, etc.
asyncPreemptStack = uintptr(total) + 8*goarch.PtrSize
if asyncPreemptStack > stackNosplit {
- // We need more than the nosplit limit. This isn't
- // unsafe, but it may limit asynchronous preemption.
- //
- // This may be a problem if we start using more
- // registers. In that case, we should store registers
- // in a context object. If we pre-allocate one per P,
- // asyncPreempt can spill just a few registers to the
- // stack, then grab its context object and spill into
- // it. When it enters the runtime, it would allocate a
- // new context for the P.
+ // We need more than the nosplit limit. This isn't unsafe, but it may
+ // limit asynchronous preemption. Consider moving state into xRegState.
print("runtime: asyncPreemptStack=", asyncPreemptStack, "\n")
throw("async stack too large")
}
--- /dev/null
+// Code generated by mkpreempt.go; DO NOT EDIT.
+
+package runtime
+
+type xRegs struct {
+ X0 [16]byte
+ X1 [16]byte
+ X2 [16]byte
+ X3 [16]byte
+ X4 [16]byte
+ X5 [16]byte
+ X6 [16]byte
+ X7 [16]byte
+ X8 [16]byte
+ X9 [16]byte
+ X10 [16]byte
+ X11 [16]byte
+ X12 [16]byte
+ X13 [16]byte
+ X14 [16]byte
+ X15 [16]byte
+}
// Code generated by mkpreempt.go; DO NOT EDIT.
#include "go_asm.h"
+#include "go_tls.h"
#include "asm_amd64.h"
#include "textflag.h"
// Save flags before clobbering them
PUSHFQ
// obj doesn't understand ADD/SUB on SP, but does understand ADJSP
- ADJSP $368
+ ADJSP $112
// But vet doesn't know ADJSP, so suppress vet stack checking
NOP SP
+ // Save GPs
MOVQ AX, 0(SP)
MOVQ CX, 8(SP)
MOVQ DX, 16(SP)
MOVQ R13, 88(SP)
MOVQ R14, 96(SP)
MOVQ R15, 104(SP)
- MOVUPS X0, 112(SP)
- MOVUPS X1, 128(SP)
- MOVUPS X2, 144(SP)
- MOVUPS X3, 160(SP)
- MOVUPS X4, 176(SP)
- MOVUPS X5, 192(SP)
- MOVUPS X6, 208(SP)
- MOVUPS X7, 224(SP)
- MOVUPS X8, 240(SP)
- MOVUPS X9, 256(SP)
- MOVUPS X10, 272(SP)
- MOVUPS X11, 288(SP)
- MOVUPS X12, 304(SP)
- MOVUPS X13, 320(SP)
- MOVUPS X14, 336(SP)
- MOVUPS X15, 352(SP)
+ // Save extended register state to p.xRegs.scratch
+ // Don't make assumptions about ABI register state. See mkpreempt.go
+ get_tls(CX)
+ MOVQ g(CX), R14
+ MOVQ g_m(R14), AX
+ MOVQ m_p(AX), AX
+ LEAQ (p_xRegs+xRegPerP_scratch)(AX), AX
+ MOVUPS X0, 0(AX)
+ MOVUPS X1, 16(AX)
+ MOVUPS X2, 32(AX)
+ MOVUPS X3, 48(AX)
+ MOVUPS X4, 64(AX)
+ MOVUPS X5, 80(AX)
+ MOVUPS X6, 96(AX)
+ MOVUPS X7, 112(AX)
+ MOVUPS X8, 128(AX)
+ MOVUPS X9, 144(AX)
+ MOVUPS X10, 160(AX)
+ MOVUPS X11, 176(AX)
+ MOVUPS X12, 192(AX)
+ MOVUPS X13, 208(AX)
+ MOVUPS X14, 224(AX)
+ MOVUPS X15, 240(AX)
CALL ·asyncPreempt2(SB)
- MOVUPS 352(SP), X15
- MOVUPS 336(SP), X14
- MOVUPS 320(SP), X13
- MOVUPS 304(SP), X12
- MOVUPS 288(SP), X11
- MOVUPS 272(SP), X10
- MOVUPS 256(SP), X9
- MOVUPS 240(SP), X8
- MOVUPS 224(SP), X7
- MOVUPS 208(SP), X6
- MOVUPS 192(SP), X5
- MOVUPS 176(SP), X4
- MOVUPS 160(SP), X3
- MOVUPS 144(SP), X2
- MOVUPS 128(SP), X1
- MOVUPS 112(SP), X0
+ // Restore non-GPs from *p.xRegs.cache
+ MOVQ g_m(R14), AX
+ MOVQ m_p(AX), AX
+ MOVQ (p_xRegs+xRegPerP_cache)(AX), AX
+ MOVUPS 240(AX), X15
+ MOVUPS 224(AX), X14
+ MOVUPS 208(AX), X13
+ MOVUPS 192(AX), X12
+ MOVUPS 176(AX), X11
+ MOVUPS 160(AX), X10
+ MOVUPS 144(AX), X9
+ MOVUPS 128(AX), X8
+ MOVUPS 112(AX), X7
+ MOVUPS 96(AX), X6
+ MOVUPS 80(AX), X5
+ MOVUPS 64(AX), X4
+ MOVUPS 48(AX), X3
+ MOVUPS 32(AX), X2
+ MOVUPS 16(AX), X1
+ MOVUPS 0(AX), X0
+ // Restore GPs
MOVQ 104(SP), R15
MOVQ 96(SP), R14
MOVQ 88(SP), R13
MOVQ 16(SP), DX
MOVQ 8(SP), CX
MOVQ 0(SP), AX
- ADJSP $-368
+ ADJSP $-112
POPFQ
POPQ BP
RET
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64
+
+// This provides common support for architectures that DO NOT use extended
+// register state in asynchronous preemption.
+
+package runtime
+
+type xRegPerG struct{}
+
+type xRegPerP struct{}
+
+// xRegState is defined only so the build fails if we try to define a real
+// xRegState on a noxreg architecture.
+type xRegState struct{}
+
+func xRegInitAlloc() {}
+
+func xRegSave(gp *g) {}
+
+//go:nosplit
+func xRegRestore(gp *g) {}
+
+func (*xRegPerP) free() {}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64
+
+// This provides common support for architectures that use extended register
+// state in asynchronous preemption.
+//
+// While asynchronous preemption stores general-purpose (GP) registers on the
+// preempted goroutine's own stack, extended register state can be used to save
+// non-GP state off the stack. In particular, this is meant for large vector
+// register files. Currently, we assume this contains only scalar data, though
+// we could change this constraint by conservatively scanning this memory.
+//
+// For an architecture to support extended register state, it must provide a Go
+// definition of an xRegs type for storing the state, and its asyncPreempt
+// implementation must write this register state to p.xRegs.scratch.
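+//
+// See the generated preempt_amd64.go (written by mkpreempt.go) for an example
+// xRegs definition.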
+
+package runtime
+
+import (
+ "internal/runtime/sys"
+ "unsafe"
+)
+
+// xRegState is long-lived extended register state. It is allocated off-heap and
+// manually managed.
+type xRegState struct {
+ _ sys.NotInHeap // Allocated from xRegAlloc
+ regs xRegs
+}
+
+// xRegPerG stores extended register state while a goroutine is asynchronously
+// preempted. The state pointer is nil otherwise, so we can reuse a (likely
+// small) pool of xRegState objects.
+type xRegPerG struct {
+ state *xRegState
+}
+
+type xRegPerP struct {
+ // scratch temporary per-P space where [asyncPreempt] saves the register
+ // state before entering Go. It's quickly copied to per-G state.
+ scratch xRegs
+
+ // cache is a 1-element allocation cache of extended register state used by
+ // asynchronous preemption. On entry to preemption, this is used as a simple
+ // allocation cache. On exit from preemption, the G's xRegState is always
+ // stored here where it can be restored, and later either freed or reused
+ // for another preemption. On exit, this serves the dual purpose of
+ // delay-freeing the allocated xRegState until after we've definitely
+ // restored it.
+ cache *xRegState
+}
+
+// xRegAlloc allocates xRegState objects.
+var xRegAlloc struct {
+ lock mutex
+ alloc fixalloc
+}
+
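+// xRegInitAlloc initializes xRegAlloc. It is called during mheap
+// initialization.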
+func xRegInitAlloc() {
+ lockInit(&xRegAlloc.lock, lockRankXRegAlloc)
+ xRegAlloc.alloc.init(unsafe.Sizeof(xRegState{}), nil, nil, &memstats.other_sys)
+}
+
+// xRegSave saves the extended register state on this P to gp.
+//
+// This must run on the system stack because it assumes the P won't change.
+//
+//go:systemstack
+func xRegSave(gp *g) {
+ if gp.xRegs.state != nil {
+ // Double preempt?
+ throw("gp.xRegState.p != nil on async preempt")
+ }
+
+ // Get the place to save the register state.
+ var dest *xRegState
+ pp := gp.m.p.ptr()
+ if pp.xRegs.cache != nil {
+ // Use the cached allocation.
+ dest = pp.xRegs.cache
+ pp.xRegs.cache = nil
+ } else {
+ // Allocate a new save block.
+ lock(&xRegAlloc.lock)
+ dest = (*xRegState)(xRegAlloc.alloc.alloc())
+ unlock(&xRegAlloc.lock)
+ }
+
+ // Copy state saved in the scratchpad to dest.
+ //
+ // If we ever need to save less state (e.g., avoid saving vector registers
+ // that aren't in use), we could have multiple allocation pools for
+ // different size states and copy only the registers we need.
+ dest.regs = pp.xRegs.scratch
+
+ // Save on the G.
+ gp.xRegs.state = dest
+}
+
+// xRegRestore prepares the extended register state on gp to be restored.
+//
+// It moves the state to gp.m.p.xRegs.cache where [asyncPreempt] expects to find
+// it. This means nothing else may use the cache between this call and the
+// return to asyncPreempt. This is not quite symmetric with [xRegSave], which
+// uses gp.m.p.xRegs.scratch. By using cache instead, we save a block copy.
+//
+// This is called with asyncPreempt on the stack and thus must not grow the
+// stack.
+//
+//go:nosplit
+func xRegRestore(gp *g) {
+ if gp.xRegs.state == nil {
+ throw("gp.xRegState.p == nil on return from async preempt")
+ }
+ // If the P has a block cached on it, free that so we can replace it.
+ pp := gp.m.p.ptr()
+ if pp.xRegs.cache != nil {
+ // Don't grow the G stack.
+ systemstack(func() {
+ pp.xRegs.free()
+ })
+ }
+ pp.xRegs.cache = gp.xRegs.state
+ gp.xRegs.state = nil
+}
+
+func (xRegs *xRegPerP) free() {
+ if xRegs.cache != nil {
+ lock(&xRegAlloc.lock)
+ xRegAlloc.alloc.free(unsafe.Pointer(xRegs.cache))
+ xRegs.cache = nil
+ unlock(&xRegAlloc.lock)
+ }
+}
pp.gcAssistTime = 0
gcCleanups.queued += pp.cleanupsQueued
pp.cleanupsQueued = 0
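+	// Release any extended register state save block cached on this P.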
+ pp.xRegs.free()
pp.status = _Pdead
}
coroarg *coro // argument during coroutine transfers
bubble *synctestBubble
+ // xRegs stores the extended register state if this G has been
+ // asynchronously preempted.
+ xRegs xRegPerG
+
// Per-G tracer state.
trace gTraceState
// gcStopTime is the nanotime timestamp that this P last entered _Pgcstop.
gcStopTime int64
+ // xRegs is the per-P extended register state used by asynchronous
+ // preemption. This is an empty struct on platforms that don't use extended
+ // register state.
+ xRegs xRegPerP
+
// Padding is no longer needed. False sharing is now not a worry because p is large enough
// that its size class is an integer multiple of the cache line size (for any of our architectures).
}
func TestSizeof(t *testing.T) {
const _64bit = unsafe.Sizeof(uintptr(0)) == 8
+ const xreg = unsafe.Sizeof(runtime.XRegPerG{}) // Varies per architecture
var tests = []struct {
val any // type as a value
_32bit uintptr // size on 32bit platforms
_64bit uintptr // size on 64bit platforms
}{
- {runtime.G{}, 280, 440}, // g, but exported for testing
- {runtime.Sudog{}, 56, 88}, // sudog, but exported for testing
+ {runtime.G{}, 280 + xreg, 440 + xreg}, // g, but exported for testing
+ {runtime.Sudog{}, 56, 88}, // sudog, but exported for testing
+ }
+
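+	// xRegPerG is expected to stay at most pointer-sized, presumably so it
+	// doesn't noticeably grow every g.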
+ if xreg > runtime.PtrSize {
+ t.Errorf("unsafe.Sizeof(xRegPerG) = %d, want <= %d", xreg, runtime.PtrSize)
}
for _, tt := range tests {