]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: optimize liveness in stackalloc
authorDaniel Morsing <daniel.morsing@gmail.com>
Wed, 5 Nov 2025 18:33:44 +0000 (18:33 +0000)
committerGopher Robot <gobot@golang.org>
Wed, 12 Nov 2025 18:02:27 +0000 (10:02 -0800)
The stackalloc code needs to run a liveness pass to build the
interference graph between stack slots. Because the values that we need
liveness over is so sparse, we can optimize the analysis by using a path
exploration algorithm rather than a iterative dataflow one

In local testing, this cuts 74.05 ms of CPU time off a build of cmd/compile.

Change-Id: I765ace87d5e8aae177e65eb63da482e3d698bea7
Reviewed-on: https://go-review.googlesource.com/c/go/+/718540
Reviewed-by: Keith Randall <khr@golang.org>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@google.com>
src/cmd/compile/internal/ssa/stackalloc.go

index 11ffe5b55ea65fed0bdea4660530b9352c1a8fae..06097e95da0bc1dac948408408a9b0ac31745e7c 100644 (file)
@@ -56,10 +56,35 @@ func putStackAllocState(s *stackAllocState) {
 }
 
 type stackValState struct {
-       typ      *types.Type
-       spill    *Value
-       needSlot bool
-       isArg    bool
+       typ       *types.Type
+       spill     *Value
+       needSlot  bool
+       isArg     bool
+       defBlock  ID
+       useBlocks []stackUseBlock
+}
+
+// addUseBlock adds a block to the set of blocks that uses this value.
+// Note that we only loosely enforce the set property by checking the last
+// block that was appended to the list and duplicates may occur.
+// Because we add values block by block (barring phi-nodes), the number of duplicates is
+// small and we deduplicate as part of the liveness algorithm later anyway.
+func (sv *stackValState) addUseBlock(b *Block, liveout bool) {
+       entry := stackUseBlock{
+               b:       b,
+               liveout: liveout,
+       }
+       if sv.useBlocks == nil || sv.useBlocks[len(sv.useBlocks)-1] != entry {
+               sv.useBlocks = append(sv.useBlocks, stackUseBlock{
+                       b:       b,
+                       liveout: liveout,
+               })
+       }
+}
+
+type stackUseBlock struct {
+       b       *Block
+       liveout bool
 }
 
 // stackalloc allocates storage in the stack frame for
@@ -99,6 +124,7 @@ func (s *stackAllocState) init(f *Func, spillLive [][]ID) {
                        s.values[v.ID].typ = v.Type
                        s.values[v.ID].needSlot = !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() && f.getHome(v.ID) == nil && !v.rematerializeable() && !v.OnWasmStack
                        s.values[v.ID].isArg = hasAnyArgOp(v)
+                       s.values[v.ID].defBlock = b.ID
                        if f.pass.debug > stackDebug && s.values[v.ID].needSlot {
                                fmt.Printf("%s needs a stack slot\n", v)
                        }
@@ -291,80 +317,89 @@ func (s *stackAllocState) stackalloc() {
 
 // computeLive computes a map from block ID to a list of
 // stack-slot-needing value IDs live at the end of that block.
-// TODO: this could be quadratic if lots of variables are live across lots of
-// basic blocks. Figure out a way to make this function (or, more precisely, the user
-// of this function) require only linear size & time.
 func (s *stackAllocState) computeLive(spillLive [][]ID) {
-       s.live = make([][]ID, s.f.NumBlocks())
-       var phis []*Value
-       live := s.f.newSparseSet(s.f.NumValues())
-       defer s.f.retSparseSet(live)
-       t := s.f.newSparseSet(s.f.NumValues())
-       defer s.f.retSparseSet(t)
-
-       // Instead of iterating over f.Blocks, iterate over their postordering.
-       // Liveness information flows backward, so starting at the end
-       // increases the probability that we will stabilize quickly.
-       po := s.f.postorder()
-       for {
-               changed := false
-               for _, b := range po {
-                       // Start with known live values at the end of the block
-                       live.clear()
-                       live.addAll(s.live[b.ID])
-
-                       // Propagate backwards to the start of the block
-                       phis = phis[:0]
-                       for i := len(b.Values) - 1; i >= 0; i-- {
-                               v := b.Values[i]
-                               live.remove(v.ID)
-                               if v.Op == OpPhi {
-                                       // Save phi for later.
-                                       // Note: its args might need a stack slot even though
-                                       // the phi itself doesn't. So don't use needSlot.
-                                       if !v.Type.IsMemory() && !v.Type.IsVoid() {
-                                               phis = append(phis, v)
-                                       }
-                                       continue
-                               }
-                               for _, a := range v.Args {
-                                       if s.values[a.ID].needSlot {
-                                               live.add(a.ID)
-                                       }
-                               }
-                       }
 
-                       // for each predecessor of b, expand its list of live-at-end values
-                       // invariant: s contains the values live at the start of b (excluding phi inputs)
-                       for i, e := range b.Preds {
-                               p := e.b
-                               t.clear()
-                               t.addAll(s.live[p.ID])
-                               t.addAll(live.contents())
-                               t.addAll(spillLive[p.ID])
-                               for _, v := range phis {
-                                       a := v.Args[i]
-                                       if s.values[a.ID].needSlot {
-                                               t.add(a.ID)
-                                       }
-                                       if spill := s.values[a.ID].spill; spill != nil {
+       // Because values using stack slots are few and far inbetween
+       // (compared to the set of all values), we use a path exploration
+       // algorithm to calculate liveness here.
+       f := s.f
+       for _, b := range f.Blocks {
+               for _, spillvid := range spillLive[b.ID] {
+                       val := &s.values[spillvid]
+                       val.addUseBlock(b, true)
+               }
+               for _, v := range b.Values {
+                       for i, a := range v.Args {
+                               val := &s.values[a.ID]
+                               useBlock := b
+                               forceLiveout := false
+                               if v.Op == OpPhi {
+                                       useBlock = b.Preds[i].b
+                                       forceLiveout = true
+                                       if spill := val.spill; spill != nil {
                                                //TODO: remove?  Subsumed by SpillUse?
-                                               t.add(spill.ID)
+                                               s.values[spill.ID].addUseBlock(useBlock, true)
                                        }
                                }
-                               if t.size() == len(s.live[p.ID]) {
+                               if !val.needSlot {
                                        continue
                                }
-                               // grow p's live set
-                               s.live[p.ID] = append(s.live[p.ID][:0], t.contents()...)
-                               changed = true
+                               val.addUseBlock(useBlock, forceLiveout)
                        }
                }
+       }
 
-               if !changed {
-                       break
+       s.live = make([][]ID, f.NumBlocks())
+       push := func(bid, vid ID) {
+               l := s.live[bid]
+               if l == nil || l[len(l)-1] != vid {
+                       l = append(l, vid)
+                       s.live[bid] = l
                }
        }
+       // TODO: If we can help along the interference graph by calculating livein sets,
+       // we can do so trivially by turning this sparse set into an array of arrays
+       // and checking the top for the current value instead of inclusion in the sparse set.
+       seen := f.newSparseSet(f.NumBlocks())
+       defer f.retSparseSet(seen)
+       // instead of pruning out duplicate blocks when we build the useblocks slices
+       // or when we add them to the queue, rely on the seen set to stop considering
+       // them. This is slightly faster than building the workqueues as sets
+       //
+       // However, this means that the queue can grow larger than the number of blocks,
+       // usually in very short functions. Returning a slice with values appended beyond the
+       // original allocation can corrupt the allocator state, so cap the queue and return
+       // the originally allocated slice regardless.
+       allocedBqueue := f.Cache.allocBlockSlice(f.NumBlocks())
+       defer f.Cache.freeBlockSlice(allocedBqueue)
+       bqueue := allocedBqueue[:0:f.NumBlocks()]
+
+       for vid, v := range s.values {
+               if !v.needSlot {
+                       continue
+               }
+               seen.clear()
+               bqueue = bqueue[:0]
+               for _, b := range v.useBlocks {
+                       if b.liveout {
+                               push(b.b.ID, ID(vid))
+                       }
+                       bqueue = append(bqueue, b.b)
+               }
+               for len(bqueue) > 0 {
+                       work := bqueue[len(bqueue)-1]
+                       bqueue = bqueue[:len(bqueue)-1]
+                       if seen.contains(work.ID) || work.ID == v.defBlock {
+                               continue
+                       }
+                       seen.add(work.ID)
+                       for _, e := range work.Preds {
+                               push(e.b.ID, ID(vid))
+                               bqueue = append(bqueue, e.b)
+                       }
+               }
+       }
+
        if s.f.pass.debug > stackDebug {
                for _, b := range s.f.Blocks {
                        fmt.Printf("stacklive %s %v\n", b, s.live[b.ID])