From 955a5a0dc5dd68ed89200a08f17590c0a94c1e09 Mon Sep 17 00:00:00 2001 From: Julia Lapenko Date: Wed, 13 Aug 2025 22:23:14 +0300 Subject: [PATCH] runtime: support arm64 Neon in async preemption This is a port of CL 669195 adjusted to save arm64 Neon registers off stack. Change-Id: Ia014778a8c9f0c1d05977b04184f51e791ae8495 Reviewed-on: https://go-review.googlesource.com/c/go/+/695916 LUCI-TryBot-Result: Go LUCI Reviewed-by: Mark Freeman Reviewed-by: Cherry Mui --- src/runtime/mkpreempt.go | 204 +++++++++++++++++++++++++++------- src/runtime/preempt_arm64.go | 38 +++++++ src/runtime/preempt_arm64.s | 66 +++++------ src/runtime/preempt_noxreg.go | 2 +- src/runtime/preempt_xreg.go | 2 +- 5 files changed, 236 insertions(+), 76 deletions(-) create mode 100644 src/runtime/preempt_arm64.go diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go index 769c4ffc5c..7b84ba0a6f 100644 --- a/src/runtime/mkpreempt.go +++ b/src/runtime/mkpreempt.go @@ -163,19 +163,21 @@ package runtime type xRegs struct { `) pos := 0 - for _, reg := range l.regs { - if reg.pos != pos { - log.Fatalf("padding not implemented") - } - typ := fmt.Sprintf("[%d]byte", reg.size) - switch { - case reg.size == 4 && reg.pos%4 == 0: - typ = "uint32" - case reg.size == 8 && reg.pos%8 == 0: - typ = "uint64" + for _, seq := range l.regs { + for _, r := range seq.regs { + if r.pos != pos && !seq.fixedOffset { + log.Fatalf("padding not implemented") + } + typ := fmt.Sprintf("[%d]byte", r.size) + switch { + case r.size == 4 && r.pos%4 == 0: + typ = "uint32" + case r.size == 8 && r.pos%8 == 0: + typ = "uint64" + } + fmt.Fprintf(g.w, "\t%s %s\n", r.name, typ) + pos += r.size } - fmt.Fprintf(g.w, "\t%s %s\n", reg.reg, typ) - pos += reg.size } fmt.Fprintf(g.w, "}\n") @@ -191,16 +193,61 @@ type xRegs struct { type layout struct { stack int - regs []regPos + regs []regSeq sp string // stack pointer register } -type regPos struct { - pos, size int +type regInfo struct { + size int // register size in bytes + name string // register name + + // Some register names may require a specific suffix. + // In ARM64, a suffix called an "arrangement specifier" can be added to + // a register name. For example: + // + // V0.B16 + // + // In this case, "V0" is the register name, and ".B16" is the suffix. + suffix string + pos int // position on stack +} + +// Some save/restore operations can involve multiple registers in a single +// instruction. For example, the LDP/STP instructions in ARM64: +// +// LDP 8(RSP), (R0, R1) +// STP (R0, R1), 8(RSP) +// +// In these cases, a pair of registers (R0, R1) is used as a single argument. +type regSeq struct { saveOp string restoreOp string - reg string + regs []regInfo + + // By default, all registers are saved on the stack, and the stack pointer offset + // is calculated based on the size of each register. For example (ARM64): + // + // STP (R0, R1), 8(RSP) + // STP (R2, R3), 24(RSP) + // + // However, automatic offset calculation may not always be desirable. + // In some cases, the offset must remain fixed: + // + // VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R0) + // VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(R0) + // + // In this example, R0 is post-incremented after each instruction, + // so the offset should not be recalculated. For such cases, + // `fixedOffset` is set to true. + fixedOffset bool + + // After conversion to a string, register names are separated by commas + // and may be wrapped in a custom pair of brackets. For example (ARM64): + // + // (R0, R1) // wrapped in parentheses + // [V0.B16, V1.B16, V2.B16, V3.B16] // wrapped in square brackets + brackets [2]string // If this register requires special save and restore, these // give those operations with a %d placeholder for the stack @@ -208,42 +255,97 @@ type regPos struct { save, restore string } -func (l *layout) add(op, reg string, size int) { - l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack, size: size}) +func (l *layout) add(op, regname string, size int) { + l.regs = append(l.regs, regSeq{saveOp: op, restoreOp: op, regs: []regInfo{{size, regname, "", l.stack}}}) l.stack += size } -func (l *layout) add2(sop, rop, reg string, size int) { - l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack, size: size}) - l.stack += size +func (l *layout) add2(sop, rop string, regs []regInfo, brackets [2]string, fixedOffset bool) { + l.regs = append(l.regs, regSeq{saveOp: sop, restoreOp: rop, regs: regs, brackets: brackets, fixedOffset: fixedOffset}) + if !fixedOffset { + for i := range regs { + regs[i].pos = l.stack + l.stack += regs[i].size + } + } } func (l *layout) addSpecial(save, restore string, size int) { - l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack, size: size}) + l.regs = append(l.regs, regSeq{save: save, restore: restore, regs: []regInfo{{size, "", "", l.stack}}}) l.stack += size } +func (rs *regSeq) String() string { + switch len(rs.regs) { + case 0: + log.Fatal("Register sequence must not be empty!") + case 1: + return rs.regs[0].name + default: + names := make([]string, 0) + for _, r := range rs.regs { + name := r.name + r.suffix + names = append(names, name) + } + return rs.brackets[0] + strings.Join(names, ", ") + rs.brackets[1] + } + return "" +} + func (l *layout) save(g *gen) { - for _, reg := range l.regs { - if reg.save != "" { - g.p(reg.save, reg.pos) + for _, seq := range l.regs { + if len(seq.regs) < 1 { + log.Fatal("Register sequence must not be empty!") + } + // When dealing with a sequence of registers, we assume that only the position + // of the first register is relevant. For example: + // + // STP (R0, R1), 8(RSP) + // STP (R2, R3), 24(RSP) + // + // Here, R0.pos is 8. While we can infer that R1.pos is 16, it doesn't need to + // be explicitly specified, as the STP instruction calculates it automatically. + pos := seq.regs[0].pos + if seq.save != "" { + g.p(seq.save, pos) } else { - g.p("%s %s, %d(%s)", reg.saveOp, reg.reg, reg.pos, l.sp) + name := seq.String() + g.p("%s %s, %d(%s)", seq.saveOp, name, pos, l.sp) } } } -func (l *layout) restore(g *gen) { - for i := len(l.regs) - 1; i >= 0; i-- { - reg := l.regs[i] +func (l *layout) restoreInOrder(g *gen, reverse bool) { + var seq []regSeq + if reverse { + seq = make([]regSeq, 0) + for i := len(l.regs) - 1; i >= 0; i-- { + seq = append(seq, l.regs[i]) + } + } else { + seq = l.regs + } + for _, reg := range seq { + if len(reg.regs) < 1 { + log.Fatal("Register sequence must not be empty!") + } + pos := reg.regs[0].pos if reg.restore != "" { - g.p(reg.restore, reg.pos) + g.p(reg.restore, pos) } else { - g.p("%s %d(%s), %s", reg.restoreOp, reg.pos, l.sp, reg.reg) + g.p("%s %d(%s), %s", reg.restoreOp, pos, l.sp, reg.String()) } } } +func (l *layout) restore(g *gen) { + l.restoreInOrder(g, true) +} + +func (l *layout) restoreDirect(g *gen) { + l.restoreInOrder(g, false) +} + func gen386(g *gen) { p := g.p @@ -320,8 +422,11 @@ func genAMD64(g *gen) { // We don't have to do this, but it results in a nice Go type. If we split // this into multiple types, we probably should stop doing this. for i := range lXRegs.regs { - lXRegs.regs[i].pos = lZRegs.regs[i].pos - lYRegs.regs[i].pos = lZRegs.regs[i].pos + for j := range lXRegs.regs[i].regs { + lXRegs.regs[i].regs[j].pos = lZRegs.regs[i].regs[j].pos + lYRegs.regs[i].regs[j].pos = lZRegs.regs[i].regs[j].pos + } + } writeXRegs(g.goarch, &lZRegs) @@ -456,6 +561,7 @@ func genARM(g *gen) { } func genARM64(g *gen) { + const vReg = "R0" // *xRegState p := g.p // Add integer registers R0-R26 // R27 (REGTMP), R28 (g), R29 (FP), R30 (LR), R31 (SP) are special @@ -466,8 +572,11 @@ func genARM64(g *gen) { i-- continue // R18 is not used, skip } - reg := fmt.Sprintf("(R%d, R%d)", i, i+1) - l.add2("STP", "LDP", reg, 16) + regs := []regInfo{ + {name: fmt.Sprintf("R%d", i), size: 8}, + {name: fmt.Sprintf("R%d", i+1), size: 8}, + } + l.add2("STP", "LDP", regs, [2]string{"(", ")"}, false) } // Add flag registers. l.addSpecial( @@ -480,10 +589,17 @@ func genARM64(g *gen) { 8) // TODO: FPCR? I don't think we'll change it, so no need to save. // Add floating point registers F0-F31. - for i := 0; i < 31; i += 2 { - reg := fmt.Sprintf("(F%d, F%d)", i, i+1) - l.add2("FSTPD", "FLDPD", reg, 16) + lVRegs := layout{sp: vReg} // Non-GP registers + for i := 0; i < 31; i += 4 { + regs := []regInfo{ + {name: fmt.Sprintf("V%d", i), suffix: ".B16", size: 16, pos: 64}, + {name: fmt.Sprintf("V%d", i+1), suffix: ".B16", size: 16, pos: 64}, + {name: fmt.Sprintf("V%d", i+2), suffix: ".B16", size: 16, pos: 64}, + {name: fmt.Sprintf("V%d", i+3), suffix: ".B16", size: 16, pos: 64}, + } + lVRegs.add2("VST1.P", "VLD1.P", regs, [2]string{"[", "]"}, true) } + writeXRegs(g.goarch, &lVRegs) if l.stack%16 != 0 { l.stack += 8 // SP needs 16-byte alignment } @@ -500,8 +616,20 @@ func genARM64(g *gen) { p("MOVD R30, (RSP)") p("#endif") + p("// Save GPs") l.save(g) + p("// Save extended register state to p.xRegs.scratch") + p("MOVD g_m(g), %s", vReg) + p("MOVD m_p(%s), %s", vReg, vReg) + p("ADD $(p_xRegs+xRegPerP_scratch), %s, %s", vReg, vReg) + lVRegs.save(g) p("CALL ·asyncPreempt2(SB)") + p("// Restore non-GPs from *p.xRegs.cache") + p("MOVD g_m(g), %s", vReg) + p("MOVD m_p(%s), %s", vReg, vReg) + p("MOVD (p_xRegs+xRegPerP_cache)(%s), %s", vReg, vReg) + lVRegs.restoreDirect(g) + p("// Restore GPs") l.restore(g) p("MOVD %d(RSP), R30", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it diff --git a/src/runtime/preempt_arm64.go b/src/runtime/preempt_arm64.go new file mode 100644 index 0000000000..1b71d2713e --- /dev/null +++ b/src/runtime/preempt_arm64.go @@ -0,0 +1,38 @@ +// Code generated by mkpreempt.go; DO NOT EDIT. + +package runtime + +type xRegs struct { + V0 [16]byte + V1 [16]byte + V2 [16]byte + V3 [16]byte + V4 [16]byte + V5 [16]byte + V6 [16]byte + V7 [16]byte + V8 [16]byte + V9 [16]byte + V10 [16]byte + V11 [16]byte + V12 [16]byte + V13 [16]byte + V14 [16]byte + V15 [16]byte + V16 [16]byte + V17 [16]byte + V18 [16]byte + V19 [16]byte + V20 [16]byte + V21 [16]byte + V22 [16]byte + V23 [16]byte + V24 [16]byte + V25 [16]byte + V26 [16]byte + V27 [16]byte + V28 [16]byte + V29 [16]byte + V30 [16]byte + V31 [16]byte +} diff --git a/src/runtime/preempt_arm64.s b/src/runtime/preempt_arm64.s index 31ec9d940f..9017d88159 100644 --- a/src/runtime/preempt_arm64.s +++ b/src/runtime/preempt_arm64.s @@ -4,13 +4,14 @@ #include "textflag.h" TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 - MOVD R30, -496(RSP) - SUB $496, RSP + MOVD R30, -240(RSP) + SUB $240, RSP MOVD R29, -8(RSP) SUB $8, RSP, R29 #ifdef GOOS_ios MOVD R30, (RSP) #endif + // Save GPs STP (R0, R1), 8(RSP) STP (R2, R3), 24(RSP) STP (R4, R5), 40(RSP) @@ -28,39 +29,32 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 MOVD R0, 216(RSP) MOVD FPSR, R0 MOVD R0, 224(RSP) - FSTPD (F0, F1), 232(RSP) - FSTPD (F2, F3), 248(RSP) - FSTPD (F4, F5), 264(RSP) - FSTPD (F6, F7), 280(RSP) - FSTPD (F8, F9), 296(RSP) - FSTPD (F10, F11), 312(RSP) - FSTPD (F12, F13), 328(RSP) - FSTPD (F14, F15), 344(RSP) - FSTPD (F16, F17), 360(RSP) - FSTPD (F18, F19), 376(RSP) - FSTPD (F20, F21), 392(RSP) - FSTPD (F22, F23), 408(RSP) - FSTPD (F24, F25), 424(RSP) - FSTPD (F26, F27), 440(RSP) - FSTPD (F28, F29), 456(RSP) - FSTPD (F30, F31), 472(RSP) + // Save extended register state to p.xRegs.scratch + MOVD g_m(g), R0 + MOVD m_p(R0), R0 + ADD $(p_xRegs+xRegPerP_scratch), R0, R0 + VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R0) + VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(R0) + VST1.P [V8.B16, V9.B16, V10.B16, V11.B16], 64(R0) + VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R0) + VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R0) + VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R0) + VST1.P [V24.B16, V25.B16, V26.B16, V27.B16], 64(R0) + VST1.P [V28.B16, V29.B16, V30.B16, V31.B16], 64(R0) CALL ·asyncPreempt2(SB) - FLDPD 472(RSP), (F30, F31) - FLDPD 456(RSP), (F28, F29) - FLDPD 440(RSP), (F26, F27) - FLDPD 424(RSP), (F24, F25) - FLDPD 408(RSP), (F22, F23) - FLDPD 392(RSP), (F20, F21) - FLDPD 376(RSP), (F18, F19) - FLDPD 360(RSP), (F16, F17) - FLDPD 344(RSP), (F14, F15) - FLDPD 328(RSP), (F12, F13) - FLDPD 312(RSP), (F10, F11) - FLDPD 296(RSP), (F8, F9) - FLDPD 280(RSP), (F6, F7) - FLDPD 264(RSP), (F4, F5) - FLDPD 248(RSP), (F2, F3) - FLDPD 232(RSP), (F0, F1) + // Restore non-GPs from *p.xRegs.cache + MOVD g_m(g), R0 + MOVD m_p(R0), R0 + MOVD (p_xRegs+xRegPerP_cache)(R0), R0 + VLD1.P 64(R0), [V0.B16, V1.B16, V2.B16, V3.B16] + VLD1.P 64(R0), [V4.B16, V5.B16, V6.B16, V7.B16] + VLD1.P 64(R0), [V8.B16, V9.B16, V10.B16, V11.B16] + VLD1.P 64(R0), [V12.B16, V13.B16, V14.B16, V15.B16] + VLD1.P 64(R0), [V16.B16, V17.B16, V18.B16, V19.B16] + VLD1.P 64(R0), [V20.B16, V21.B16, V22.B16, V23.B16] + VLD1.P 64(R0), [V24.B16, V25.B16, V26.B16, V27.B16] + VLD1.P 64(R0), [V28.B16, V29.B16, V30.B16, V31.B16] + // Restore GPs MOVD 224(RSP), R0 MOVD R0, FPSR MOVD 216(RSP), R0 @@ -78,8 +72,8 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0 LDP 40(RSP), (R4, R5) LDP 24(RSP), (R2, R3) LDP 8(RSP), (R0, R1) - MOVD 496(RSP), R30 + MOVD 240(RSP), R30 MOVD -8(RSP), R29 MOVD (RSP), R27 - ADD $512, RSP + ADD $256, RSP RET (R27) diff --git a/src/runtime/preempt_noxreg.go b/src/runtime/preempt_noxreg.go index dfe46559b5..9f03b2b333 100644 --- a/src/runtime/preempt_noxreg.go +++ b/src/runtime/preempt_noxreg.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build !amd64 +//go:build !amd64 && !arm64 // This provides common support for architectures that DO NOT use extended // register state in asynchronous preemption. diff --git a/src/runtime/preempt_xreg.go b/src/runtime/preempt_xreg.go index 9e05455ddb..f4578a4d76 100644 --- a/src/runtime/preempt_xreg.go +++ b/src/runtime/preempt_xreg.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build amd64 +//go:build amd64 || arm64 // This provides common support for architectures that use extended register // state in asynchronous preemption. -- 2.52.0