From 97c48fd93db95f56849395cf8c7447232d56607a Mon Sep 17 00:00:00 2001 From: Keith Randall Date: Mon, 9 Feb 2026 15:38:55 -0800 Subject: [PATCH] cmd/compile: move likely used values into registers before entering loop When we are about to enter a loop, we try to put values that will be used soon into registers, so we put them into registers once outside the loop instead of N times inside the loop. But we currently don't kick out values we won't use for a while to make room. This CL does that. So even if register pressure is large around the loop, we can still use registers for all the in-loop values. The values generating the register pressure, but not used inside the loop, will get spilled around the loop. This is particularly useful for Phi arguments. We want to promote from fixed zero registers or other rematerializeable values to a general register before the loop starts. Fixes #77299 Change-Id: I00efc3d3014163aaf377693830c7d7957eaa515f Reviewed-on: https://go-review.googlesource.com/c/go/+/743640 LUCI-TryBot-Result: Go LUCI Reviewed-by: Keith Randall Reviewed-by: Junyang Shao --- src/cmd/compile/internal/ssa/func_test.go | 5 ++ src/cmd/compile/internal/ssa/regalloc.go | 73 +++++++++++++++++-- src/cmd/compile/internal/ssa/regalloc_test.go | 59 +++++++++++++++ 3 files changed, 129 insertions(+), 8 deletions(-) diff --git a/src/cmd/compile/internal/ssa/func_test.go b/src/cmd/compile/internal/ssa/func_test.go index 1a378d4a95..479186b34d 100644 --- a/src/cmd/compile/internal/ssa/func_test.go +++ b/src/cmd/compile/internal/ssa/func_test.go @@ -260,6 +260,11 @@ func Eq(cond, sub, alt string) ctrl { return ctrl{BlockAMD64EQ, cond, []string{sub, alt}} } +// Lt specifies a BlockAMD64LT. +func Lt(cond, yes, no string) ctrl { + return ctrl{BlockAMD64LT, cond, []string{yes, no}} +} + // bloc, ctrl, and valu are internal structures used by Bloc, Valu, Goto, // If, and Exit to help define blocks. 
diff --git a/src/cmd/compile/internal/ssa/regalloc.go b/src/cmd/compile/internal/ssa/regalloc.go index a0257f3064..dbcc93a7dd 100644 --- a/src/cmd/compile/internal/ssa/regalloc.go +++ b/src/cmd/compile/internal/ssa/regalloc.go @@ -2015,7 +2015,27 @@ func (s *regAllocState) regalloc(f *Func) { goto badloop } - // TODO: sort by distance, pick the closest ones? + // Look into target block, find Phi arguments that come from b. + phiArgs := regValLiveSet // reuse this space + phiArgs.clear() + for _, v := range b.Succs[0].b.Values { + if v.Op == OpPhi { + phiArgs.add(v.Args[b.Succs[0].i].ID) + } + } + + // Get mask of all registers that might be used soon in the destination. + // We don't want to kick values out of these registers, but we will + // kick out an unlikely-to-be-used value for a likely-to-be-used one. + var likelyUsedRegs regMask + for _, live := range s.live[b.ID] { + if live.dist < unlikelyDistance { + likelyUsedRegs |= s.values[live.ID].regs + } + } + // Promote values we're going to use soon in the destination to registers. + // Note that this iterates nearest-use first, as we sorted + // live lists by distance in computeLive. for _, live := range s.live[b.ID] { if live.dist >= unlikelyDistance { // Don't preload anything live after the loop. @@ -2023,14 +2043,41 @@ func (s *regAllocState) regalloc(f *Func) { } vid := live.ID vi := &s.values[vid] - if vi.regs != 0 { + v := s.orig[vid] + if phiArgs.contains(vid) { + // A phi argument needs its value in a regular register, + // as returned by compatRegs. Being in a fixed register + // (e.g. the zero register) or being easily + // rematerializeable isn't enough. + if vi.regs&s.compatRegs(v.Type) != 0 { + continue + } + } else { + if vi.regs != 0 { + continue + } + if vi.rematerializeable { + // TODO: maybe we should not skip rematerializeable + // values here. One rematerialization outside the loop + // is better than N in the loop. 
But rematerializations + // are cheap, and spilling another value may not be. + // And we don't want to materialize the zero register + // into a different register when it is just the + // argument to a store. + continue + } + } + if vi.rematerializeable && s.f.Config.ctxt.Arch.Arch == sys.ArchWasm { continue } - if vi.rematerializeable { + // Registers we could load v into. + // Don't kick out other likely-used values. + m := s.compatRegs(v.Type) &^ likelyUsedRegs + if m == 0 { + // Too many likely-used values to give them all a register. continue } - v := s.orig[vid] - m := s.compatRegs(v.Type) &^ s.used + // Use desired register if available. outerloop: for _, e := range desired.entries { if e.ID != vid { continue } for _, r := range e.regs { if r != noRegister && m>>r&1 != 0 { m = regMask(1) << r break outerloop } } } if m&^desired.avoid != 0 { m &^= desired.avoid } - if m != 0 { - s.allocValToReg(v, m, false, b.Pos) - } + s.allocValToReg(v, m, false, b.Pos) + likelyUsedRegs |= s.values[v.ID].regs } } badloop: @@ -3129,6 +3175,17 @@ func (s *regAllocState) computeLive() { unfinishedBlocks = unfinishedBlocks[:n] } + // Sort live values in order of their nearest next use. + // Useful for promoting values to registers, nearest use first. + for _, b := range f.Blocks { + slices.SortFunc(s.live[b.ID], func(a, b liveInfo) int { + if a.dist != b.dist { + return cmp.Compare(a.dist, b.dist) + } + return cmp.Compare(a.ID, b.ID) // for deterministic sorting + }) + } + s.computeDesired() if f.pass.debug > regDebug { diff --git a/src/cmd/compile/internal/ssa/regalloc_test.go b/src/cmd/compile/internal/ssa/regalloc_test.go index 12f5820f1f..18d42063fb 100644 --- a/src/cmd/compile/internal/ssa/regalloc_test.go +++ b/src/cmd/compile/internal/ssa/regalloc_test.go @@ -346,3 +346,62 @@ func TestRematerializeableRegCompatible(t *testing.T) { t.Errorf("Expects an Copy to be issued, but got: %+v", f.f) } } + +func TestPreload(t *testing.T) { + c := testConfig(t) + // amd64 has 13 general registers. We use 1 for ptr and 12 for x0-11. 
+ // They all contain live values at the end of the entry block. + f := c.Fun("entry", + Bloc("entry", + Valu("ptr", OpArgIntReg, c.config.Types.Int8.PtrTo(), 0, &AuxNameOffset{Name: c.Temp(c.config.Types.Int8.PtrTo()), Offset: 0}), + Valu("mem", OpInitMem, types.TypeMem, 0, nil), + Valu("x0", OpAMD64MOVBload, c.config.Types.Int8, 0, nil, "ptr", "mem"), + Valu("x1", OpAMD64MOVBload, c.config.Types.Int8, 1, nil, "ptr", "mem"), + Valu("x2", OpAMD64MOVBload, c.config.Types.Int8, 2, nil, "ptr", "mem"), + Valu("x3", OpAMD64MOVBload, c.config.Types.Int8, 3, nil, "ptr", "mem"), + Valu("x4", OpAMD64MOVBload, c.config.Types.Int8, 4, nil, "ptr", "mem"), + Valu("x5", OpAMD64MOVBload, c.config.Types.Int8, 5, nil, "ptr", "mem"), + Valu("x6", OpAMD64MOVBload, c.config.Types.Int8, 6, nil, "ptr", "mem"), + Valu("x7", OpAMD64MOVBload, c.config.Types.Int8, 7, nil, "ptr", "mem"), + Valu("x8", OpAMD64MOVBload, c.config.Types.Int8, 8, nil, "ptr", "mem"), + Valu("x9", OpAMD64MOVBload, c.config.Types.Int8, 9, nil, "ptr", "mem"), + Valu("x10", OpAMD64MOVBload, c.config.Types.Int8, 10, nil, "ptr", "mem"), + Valu("x11", OpAMD64MOVBload, c.config.Types.Int8, 11, nil, "ptr", "mem"), + Valu("init", OpAMD64MOVQconst, c.config.Types.Int64, 0, nil), + Goto("loopHead"), + ), + Bloc("loopHead", + Valu("phi", OpPhi, c.config.Types.Int64, 0, nil, "init", "next"), + Valu("test", OpAMD64CMPQconst, types.TypeFlags, 10, nil, "phi"), + Lt("test", "loopBody", "exit"), + ), + Bloc("loopBody", + Valu("next", OpAMD64ADDQconst, c.config.Types.Int64, 1, nil, "phi"), + Goto("loopHead"), + ), + Bloc("exit", + Valu("m0", OpAMD64MOVBstore, types.TypeMem, 0, nil, "ptr", "x0", "mem"), + Valu("m1", OpAMD64MOVBstore, types.TypeMem, 1, nil, "ptr", "x1", "m0"), + Valu("m2", OpAMD64MOVBstore, types.TypeMem, 2, nil, "ptr", "x2", "m1"), + Valu("m3", OpAMD64MOVBstore, types.TypeMem, 3, nil, "ptr", "x3", "m2"), + Valu("m4", OpAMD64MOVBstore, types.TypeMem, 4, nil, "ptr", "x4", "m3"), + Valu("m5", OpAMD64MOVBstore, 
types.TypeMem, 5, nil, "ptr", "x5", "m4"), + Valu("m6", OpAMD64MOVBstore, types.TypeMem, 6, nil, "ptr", "x6", "m5"), + Valu("m7", OpAMD64MOVBstore, types.TypeMem, 7, nil, "ptr", "x7", "m6"), + Valu("m8", OpAMD64MOVBstore, types.TypeMem, 8, nil, "ptr", "x8", "m7"), + Valu("m9", OpAMD64MOVBstore, types.TypeMem, 9, nil, "ptr", "x9", "m8"), + Valu("m10", OpAMD64MOVBstore, types.TypeMem, 10, nil, "ptr", "x10", "m9"), + Valu("m11", OpAMD64MOVBstore, types.TypeMem, 11, nil, "ptr", "x11", "m10"), + Ret("m11"), + ), + ) + f.f.Blocks[1].Likely = BranchLikely + regalloc(f.f) + checkFunc(f.f) + + v := f.values["phi"] + loc := f.f.RegAlloc[v.ID] + if _, ok := loc.(*Register); !ok { + t.Errorf("Expects to use a register for phi, but got: %s\n%v", loc, f.f) + } +} -- 2.52.0