From 4e761b9a181a090ca9ec52916c4aef93c4cbc090 Mon Sep 17 00:00:00 2001
From: Daniel Morsing
Date: Wed, 5 Nov 2025 18:33:44 +0000
Subject: [PATCH] cmd/compile: optimize liveness in stackalloc

The stackalloc code needs to run a liveness pass to build the
interference graph between stack slots. Because the values that we need
liveness over are so sparse, we can optimize the analysis by using a
path exploration algorithm rather than an iterative dataflow one.

In local testing, this cuts 74.05 ms of CPU time off a build of
cmd/compile.

Change-Id: I765ace87d5e8aae177e65eb63da482e3d698bea7
Reviewed-on: https://go-review.googlesource.com/c/go/+/718540
Reviewed-by: Keith Randall
Auto-Submit: Keith Randall
Reviewed-by: Junyang Shao
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Keith Randall
---
 src/cmd/compile/internal/ssa/stackalloc.go | 164 +++++++++++++--------
 1 file changed, 98 insertions(+), 66 deletions(-)

diff --git a/src/cmd/compile/internal/ssa/stackalloc.go b/src/cmd/compile/internal/ssa/stackalloc.go
index 11ffe5b55e..06097e95da 100644
--- a/src/cmd/compile/internal/ssa/stackalloc.go
+++ b/src/cmd/compile/internal/ssa/stackalloc.go
@@ -56,10 +56,32 @@ func putStackAllocState(s *stackAllocState) {
 }
 
 type stackValState struct {
-	typ      *types.Type
-	spill    *Value
-	needSlot bool
-	isArg    bool
+	typ       *types.Type
+	spill     *Value
+	needSlot  bool
+	isArg     bool
+	defBlock  ID
+	useBlocks []stackUseBlock
+}
+
+// addUseBlock adds a block to the set of blocks that use this value.
+// Note that we only loosely enforce the set property by checking the last
+// block that was appended to the list, so duplicates may occur.
+// Because we add values block by block (barring phi nodes), the number of duplicates is
+// small and we deduplicate as part of the liveness algorithm later anyway.
+func (sv *stackValState) addUseBlock(b *Block, liveout bool) {
+	entry := stackUseBlock{
+		b:       b,
+		liveout: liveout,
+	}
+	if sv.useBlocks == nil || sv.useBlocks[len(sv.useBlocks)-1] != entry {
+		sv.useBlocks = append(sv.useBlocks, entry)
+	}
+}
+
+type stackUseBlock struct {
+	b       *Block
+	liveout bool
 }
 
 // stackalloc allocates storage in the stack frame for
@@ -99,6 +121,7 @@ func (s *stackAllocState) init(f *Func, spillLive [][]ID) {
 			s.values[v.ID].typ = v.Type
 			s.values[v.ID].needSlot = !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() && f.getHome(v.ID) == nil && !v.rematerializeable() && !v.OnWasmStack
 			s.values[v.ID].isArg = hasAnyArgOp(v)
+			s.values[v.ID].defBlock = b.ID
 			if f.pass.debug > stackDebug && s.values[v.ID].needSlot {
 				fmt.Printf("%s needs a stack slot\n", v)
 			}
@@ -291,80 +314,89 @@ func (s *stackAllocState) stackalloc() {
 
 // computeLive computes a map from block ID to a list of
 // stack-slot-needing value IDs live at the end of that block.
-// TODO: this could be quadratic if lots of variables are live across lots of
-// basic blocks. Figure out a way to make this function (or, more precisely, the user
-// of this function) require only linear size & time.
 func (s *stackAllocState) computeLive(spillLive [][]ID) {
-	s.live = make([][]ID, s.f.NumBlocks())
-	var phis []*Value
-	live := s.f.newSparseSet(s.f.NumValues())
-	defer s.f.retSparseSet(live)
-	t := s.f.newSparseSet(s.f.NumValues())
-	defer s.f.retSparseSet(t)
-
-	// Instead of iterating over f.Blocks, iterate over their postordering.
-	// Liveness information flows backward, so starting at the end
-	// increases the probability that we will stabilize quickly.
-	po := s.f.postorder()
-	for {
-		changed := false
-		for _, b := range po {
-			// Start with known live values at the end of the block
-			live.clear()
-			live.addAll(s.live[b.ID])
-
-			// Propagate backwards to the start of the block
-			phis = phis[:0]
-			for i := len(b.Values) - 1; i >= 0; i-- {
-				v := b.Values[i]
-				live.remove(v.ID)
-				if v.Op == OpPhi {
-					// Save phi for later.
-					// Note: its args might need a stack slot even though
-					// the phi itself doesn't. So don't use needSlot.
-					if !v.Type.IsMemory() && !v.Type.IsVoid() {
-						phis = append(phis, v)
-					}
-					continue
-				}
-				for _, a := range v.Args {
-					if s.values[a.ID].needSlot {
-						live.add(a.ID)
-					}
-				}
-			}
-			// for each predecessor of b, expand its list of live-at-end values
-			// invariant: s contains the values live at the start of b (excluding phi inputs)
-			for i, e := range b.Preds {
-				p := e.b
-				t.clear()
-				t.addAll(s.live[p.ID])
-				t.addAll(live.contents())
-				t.addAll(spillLive[p.ID])
-				for _, v := range phis {
-					a := v.Args[i]
-					if s.values[a.ID].needSlot {
-						t.add(a.ID)
-					}
-					if spill := s.values[a.ID].spill; spill != nil {
+	// Because values using stack slots are few and far between
+	// (compared to the set of all values), we use a path exploration
+	// algorithm to calculate liveness here.
+	f := s.f
+	for _, b := range f.Blocks {
+		for _, spillvid := range spillLive[b.ID] {
+			val := &s.values[spillvid]
+			val.addUseBlock(b, true)
+		}
+		for _, v := range b.Values {
+			for i, a := range v.Args {
+				val := &s.values[a.ID]
+				useBlock := b
+				forceLiveout := false
+				if v.Op == OpPhi {
+					useBlock = b.Preds[i].b
+					forceLiveout = true
+					if spill := val.spill; spill != nil {
 						//TODO: remove? Subsumed by SpillUse?
-						t.add(spill.ID)
+						s.values[spill.ID].addUseBlock(useBlock, true)
 					}
 				}
-				if t.size() == len(s.live[p.ID]) {
+				if !val.needSlot {
 					continue
 				}
-				// grow p's live set
-				s.live[p.ID] = append(s.live[p.ID][:0], t.contents()...)
-				changed = true
+				val.addUseBlock(useBlock, forceLiveout)
 			}
 		}
+	}
 
-		if !changed {
-			break
+	s.live = make([][]ID, f.NumBlocks())
+	push := func(bid, vid ID) {
+		l := s.live[bid]
+		if l == nil || l[len(l)-1] != vid {
+			l = append(l, vid)
+			s.live[bid] = l
 		}
 	}
+	// TODO: If calculating livein sets would help the interference graph,
+	// we can do so trivially by turning this sparse set into an array of arrays
+	// and checking the top for the current value instead of inclusion in the sparse set.
+	seen := f.newSparseSet(f.NumBlocks())
+	defer f.retSparseSet(seen)
+	// Instead of pruning out duplicate blocks when we build the useBlocks slices
+	// or when we add them to the queue, rely on the seen set to stop considering
+	// them. This is slightly faster than building the work queues as sets.
+	//
+	// However, this means that the queue can grow larger than the number of blocks,
+	// usually in very short functions. Returning a slice with values appended beyond the
+	// original allocation can corrupt the allocator state, so cap the queue and return
+	// the originally allocated slice regardless.
+	allocedBqueue := f.Cache.allocBlockSlice(f.NumBlocks())
+	defer f.Cache.freeBlockSlice(allocedBqueue)
+	bqueue := allocedBqueue[:0:f.NumBlocks()]
+
+	for vid, v := range s.values {
+		if !v.needSlot {
+			continue
+		}
+		seen.clear()
+		bqueue = bqueue[:0]
+		for _, b := range v.useBlocks {
+			if b.liveout {
+				push(b.b.ID, ID(vid))
+			}
+			bqueue = append(bqueue, b.b)
+		}
+		for len(bqueue) > 0 {
+			work := bqueue[len(bqueue)-1]
+			bqueue = bqueue[:len(bqueue)-1]
+			if seen.contains(work.ID) || work.ID == v.defBlock {
+				continue
+			}
+			seen.add(work.ID)
+			for _, e := range work.Preds {
+				push(e.b.ID, ID(vid))
+				bqueue = append(bqueue, e.b)
+			}
+		}
+	}
+
 	if s.f.pass.debug > stackDebug {
 		for _, b := range s.f.Blocks {
 			fmt.Printf("stacklive %s %v\n", b, s.live[b.ID])
-- 
2.52.0
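Editor's note: the sketch below is not part of the CL. It illustrates the path exploration idea from the commit message on a toy CFG: for each value, walk backward from its use blocks toward its definition, marking the value live-out of every block on the way, so the work is proportional to the region where the value is live rather than to the whole function. All names here (block, use, liveOut) are invented for the example; the real pass operates on *ssa.Block with sparse sets and the cached slices shown in the diff.

package main

import "fmt"

// block is a hypothetical stand-in for ssa.Block: an ID plus predecessor edges.
type block struct {
	id    int
	preds []*block
}

// A use records a block where the variable is read; liveout marks uses that
// keep the variable live out of that block (e.g. phi args read on the edge).
type use struct {
	b       *block
	liveout bool
}

// liveOut walks backward from each use toward the definition and reports the
// set of blocks the variable is live out of.
func liveOut(def *block, uses []use, numBlocks int) map[int]bool {
	live := make(map[int]bool)
	seen := make([]bool, numBlocks)
	var queue []*block
	for _, u := range uses {
		if u.liveout {
			live[u.b.id] = true
		}
		queue = append(queue, u.b)
	}
	for len(queue) > 0 {
		work := queue[len(queue)-1]
		queue = queue[:len(queue)-1]
		// Stop at blocks we've already handled and at the definition:
		// the value is not live-in to its defining block, so don't walk past it.
		if seen[work.id] || work == def {
			continue
		}
		seen[work.id] = true
		for _, p := range work.preds {
			live[p.id] = true // on a path from use back to def, so live-out of p
			queue = append(queue, p)
		}
	}
	return live
}

func main() {
	// Diamond CFG: b0 -> {b1, b2} -> b3. Value defined in b0, used in b3.
	b0 := &block{id: 0}
	b1 := &block{id: 1, preds: []*block{b0}}
	b2 := &block{id: 2, preds: []*block{b0}}
	b3 := &block{id: 3, preds: []*block{b1, b2}}
	fmt.Println(liveOut(b0, []use{{b: b3}}, 4)) // map[0:true 1:true 2:true]
}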
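The "loose set" trick shared by addUseBlock and the push closure can be seen in isolation below; appendIfNew is a hypothetical helper, not from the CL. Because producers emit duplicates back to back, comparing against the last element keeps the list nearly duplicate-free; non-adjacent duplicates can survive, which is why the liveness walk still deduplicates with the seen set.

package main

import "fmt"

// appendIfNew appends v only if it differs from the last element, the same
// adjacent-duplicate check used on useBlocks and on the live lists.
func appendIfNew(list []int, v int) []int {
	if len(list) == 0 || list[len(list)-1] != v {
		list = append(list, v)
	}
	return list
}

func main() {
	var l []int
	for _, v := range []int{3, 3, 3, 7, 7, 3} {
		l = appendIfNew(l, v)
	}
	fmt.Println(l) // [3 7 3]: only adjacent duplicates are dropped
}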
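The queue-capping comment in the diff relies on Go's full slice expressions: bqueue := allocedBqueue[:0:f.NumBlocks()] caps the capacity, so an append past that cap allocates a fresh backing array instead of growing the cached one, and freeBlockSlice always gets back exactly the allocation it handed out. A minimal standalone demonstration of that behavior, assuming nothing beyond the standard library:

package main

import "fmt"

func main() {
	// Pretend this slice came from an allocator cache that will take it back.
	backing := make([]int, 8)

	// Full slice expression: length 0, capacity capped at len(backing).
	// Appending beyond the cap reallocates rather than growing backing.
	queue := backing[:0:len(backing)]
	for i := 0; i < 10; i++ {
		queue = append(queue, i)
	}
	fmt.Println(len(queue), cap(queue) > len(backing)) // 10 true

	// backing is untouched beyond its original allocation and can be
	// returned to the cache safely, however large queue grew.
	fmt.Println(len(backing), cap(backing)) // 8 8
}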