type loop struct {
header *Block // The header node of this (reducible) loop
outer *loop // loop containing this loop
- // Next two fields not currently used, but cheap to maintain,
- // and aid in computation of inner-ness and list of blocks.
- nBlocks int32 // Number of blocks in this loop but not within inner loops
- isInner bool // True if never discovered to contain a loop
- containsCall bool // if any block in this loop or any loop it contains is a BlockCall or BlockDefer
+
+ // By default, children, exits, and depth are not initialized.
+ children []*loop // loops nested directly within this loop. Initialized by assembleChildren().
+ exits []*Block // exits records blocks reached by exits from this loop. Initialized by findExits().
+
+ // Loops aren't that common, so rather than force regalloc to keep
+ // a map or slice for its data, just put it here.
+ spills []*Value // regalloc scratch: values spilled in this loop, candidates for sinking
+ scratch int32 // regalloc scratch: number of this loop's blocks processed so far
+
+ // The next three fields are used by regalloc and/or
+ // aid in computing inner-ness and the list of blocks.
+ nBlocks int32 // Number of blocks in this loop but not within inner loops
+ depth int16 // Nesting depth of the loop; 1 is outermost. Initialized by calculateDepths().
+ isInner bool // True if never discovered to contain a loop
+
+ // Register allocation uses this.
+ containsCall bool // if any block in this loop or any loop it contains is a BlockCall or BlockDefer
}
// outerinner records that outer contains inner
po []*Block
sdom sparseTree
loops []*loop
+
+ // Record which of the lazily initialized fields have actually been initialized.
+ initializedChildren, initializedDepth, initializedExits bool
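+
+ // Callers that need exits may simply call findExits(); it runs
+ // calculateDepths() and assembleChildren() as needed, and each
+ // initializer is idempotent (a repeated call returns immediately).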
}
func min8(a, b int8) int8 {
innermost.nBlocks++
}
}
+
+ ln := &loopnest{f: f, b2l: b2l, po: po, sdom: sdom, loops: loops}
+
+ // Curious about the loopiness? "-d=ssa/likelyadjust/stats"
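+ // (a compiler debug flag, e.g. passed as -d=ssa/likelyadjust/stats to go tool compile)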
+ if f.pass.stats > 0 && len(loops) > 0 {
+ ln.assembleChildren()
+ ln.calculateDepths()
+ ln.findExits()
+
+ // Note stats for non-innermost loops are slightly flawed because
+ // they don't account for inner loop exits that span multiple levels.
+
+ for _, l := range loops {
+ x := len(l.exits)
+ cf := 0
+ if !l.containsCall {
+ cf = 1
+ }
+ inner := 0
+ if l.isInner {
+ inner++
+ }
+
+ f.logStat("loopstats:",
+ l.depth, "depth", x, "exits",
+ inner, "is_inner", cf, "is_callfree", l.nBlocks, "n_blocks")
+ }
+ }
+
if f.pass.debug > 1 && len(loops) > 0 {
fmt.Printf("Loops in %s:\n", f.Name)
for _, l := range loops {
}
fmt.Print("\n")
}
- return &loopnest{f, b2l, po, sdom, loops}
+ return ln
+}
+
+// assembleChildren initializes the children field of each
+// loop in the nest. Loop A is a child of loop B if A is
+// directly nested within B (based on the reducible-loops
+// detection above).
+func (ln *loopnest) assembleChildren() {
+ if ln.initializedChildren {
+ return
+ }
+ for _, l := range ln.loops {
+ if l.outer != nil {
+ l.outer.children = append(l.outer.children, l)
+ }
+ }
+ ln.initializedChildren = true
+}
+
+// calculateDepths uses the children field of loops
+// to determine the nesting depth (outer=1) of each
+// loop. This is helpful for finding exit edges.
+func (ln *loopnest) calculateDepths() {
+ if ln.initializedDepth {
+ return
+ }
+ ln.assembleChildren()
+ for _, l := range ln.loops {
+ if l.outer == nil {
+ l.setDepth(1)
+ }
+ }
+ ln.initializedDepth = true
+}
+
+// findExits uses loop depth information to find the
+// exits from a loop.
+func (ln *loopnest) findExits() {
+ if ln.initializedExits {
+ return
+ }
+ ln.calculateDepths()
+ b2l := ln.b2l
+ for _, b := range ln.po {
+ l := b2l[b.ID]
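+ // Only a block with two successors can be a loop exit: a block
+ // with a single successor outside the loop could not reach the
+ // loop header again, so it would not be in the loop at all.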
+ if l != nil && len(b.Succs) == 2 {
+ sl := b2l[b.Succs[0].ID]
+ if recordIfExit(l, sl, b.Succs[0]) {
+ continue
+ }
+ sl = b2l[b.Succs[1].ID]
+ if recordIfExit(l, sl, b.Succs[1]) {
+ continue
+ }
+ }
+ }
+ ln.initializedExits = true
+}
+
+// recordIfExit checks sl (the loop containing b) to see if it
+// is outside of loop l, and if so, records b as an exit block
+// from l and returns true.
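+// For example, if l has depth 2 and sl has depth 3, sl is walked
+// outward to depth 2; if that ancestor is a loop other than l, the
+// edge leaves l and b is recorded as an exit.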
+func recordIfExit(l, sl *loop, b *Block) bool {
+ if sl != l {
+ if sl == nil || sl.depth <= l.depth {
+ l.exits = append(l.exits, b)
+ return true
+ }
+ // sl is not nil and is deeper than l;
+ // this can happen with a goto into an irreducible loop made from gotos.
+ for sl.depth > l.depth {
+ sl = sl.outer
+ }
+ if sl != l {
+ l.exits = append(l.exits, b)
+ return true
+ }
+ }
+ return false
+}
+
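+// setDepth sets the depth of loop l to d, and recursively
+// sets the depth of each loop nested directly within l to d+1.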
+func (l *loop) setDepth(d int16) {
+ l.depth = d
+ for _, c := range l.children {
+ c.setDepth(d + 1)
+ }
}
// will have no use (so don't run deadcode after regalloc!).
// TODO: maybe we should introduce these extra phis?
+// Additional not-quite-SSA output occurs when spills are sunk out
+// of loops to the targets of exit edges from the loop. Before sinking,
+// there is one spill site (one StoreReg) targeting stack slot X; after
+// sinking there may be multiple spill sites targeting stack slot X,
+// with no phi functions at any join points reachable by the multiple
+// spill sites.
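+//
+// Schematically: if the spill of v is sunk to two exits E1 and E2 of
+// a loop, both StoreRegs target v's single stack slot, and a block
+// joining paths from E1 and E2 sees two definitions of that slot
+// with no phi between them.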
+
package ssa
import (
)
const (
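+ // debug levels; f.pass.debug is compared against these thresholds.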
- logSpills = iota
+ moveSpills = iota
+ logSpills
regDebug
stackDebug
)
uses *use // list of uses in this block
spill *Value // spilled copy of the Value
spillUsed bool
+ spillUsedShuffle bool // true if used in shuffling, after ordinary uses
needReg bool // cached value of !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags()
rematerializeable bool // cached value of v.rematerializeable()
desired register // register we want value to be in, if any
loopnest *loopnest
}
+type spillToSink struct {
+ spill *Value // Spill instruction to move (a StoreReg)
+ dests int32 // Bitmask of exit blocks from the loop in which spill/val is defined; 1<<i set means val is live into loop.exits[i].
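+ // For example, dests == 5 (binary 101) means the value is live
+ // into loop.exits[0] and loop.exits[2].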
+}
+
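+// spilledValue returns the value being spilled, i.e. the
+// argument of the StoreReg.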
+func (sts *spillToSink) spilledValue() *Value {
+ return sts.spill.Args[0]
+}
+
type endReg struct {
r register
v *Value // pre-regalloc value held in this register (TODO: can we use ID here?)
return m &^ s.reserved()
}
+// loopForBlock returns the loop containing block b,
+// provided that the loop is "interesting" for purposes
+// of improving register allocation (= is inner, and does
+// not contain a call).
+func (s *regAllocState) loopForBlock(b *Block) *loop {
+ loop := s.loopnest.b2l[b.ID]
+
+ // Minor for-the-time-being optimization: nothing happens
+ // unless a loop is both inner and call-free, therefore
+ // don't bother with other loops.
+ if loop != nil && (loop.containsCall || !loop.isInner) {
+ loop = nil
+ }
+ return loop
+}
+
func (s *regAllocState) regalloc(f *Func) {
liveSet := f.newSparseSet(f.NumValues())
defer f.retSparseSet(liveSet)
var phiRegs []register
var args []*Value
+ // statistics
+ var nSpills int // # of spills remaining
+ var nSpillsInner int // # of spills remaining in inner loops
+ var nSpillsSunk int // # of sunk spills remaining
+ var nSpillsChanged int // # of sunk spills lost because of register use change
+ var nSpillsSunkUnused int // # of spills not sunk because they were removed completely
+ var nSpillsNotSunkLateUse int // # of spills not sunk because of very late use (in shuffle)
+
if f.Entry != f.Blocks[0] {
f.Fatalf("entry block must be first")
}
+ // Get loop nest so that spills in inner loops can be
+ // tracked. When the last block of a loop is processed,
+ // attempt to move spills out of the loop.
+ s.loopnest.findExits()
+
+ // Spills are moved from one block's slice of values to another's.
+ // Moving a spill before register allocation is complete would
+ // confuse the allocator, so candidates are recorded, then rechecked
+ // and moved after all allocation (register and stack) is complete.
+ // Because movement is only within a stack slot's lifetime, it
+ // is safe to do this.
+ var toSink []spillToSink
+ // Will be used to figure out live inputs to exit blocks of inner loops.
+ entryCandidates := newSparseMap(f.NumValues())
+
for _, b := range f.Blocks {
s.curBlock = b
+ loop := s.loopForBlock(b)
// Initialize liveSet and uses fields for this block.
// Walk backwards through the block doing liveness analysis.
s.setOrig(spill, v)
s.values[v.ID].spill = spill
s.values[v.ID].spillUsed = false
+ if loop != nil {
+ loop.spills = append(loop.spills, v)
+ nSpillsInner++
+ }
+ nSpills++
}
// Save the starting state for use by merge edges.
s.setOrig(spill, v)
s.values[v.ID].spill = spill
s.values[v.ID].spillUsed = false
+ if loop != nil {
+ loop.spills = append(loop.spills, v)
+ nSpillsInner++
+ }
+ nSpills++
}
}
s.values[e.ID].spillUsed = true
}
+ // Keep track of values that are spilled in the loop, but whose spill
+ // is not used in the loop. It may be possible to move ("sink") the
+ // spill out of the loop into one or more exit blocks.
+ if loop != nil {
+ loop.scratch++ // increment count of blocks in this loop that have been processed
+ if loop.scratch == loop.nBlocks { // just processed last block of loop, if it is an inner loop.
+ // This check is redundant with code at the top of the loop.
+ // This is definitive; the one at the top of the loop is an optimization.
+ if loop.isInner && // Common case, easier, most likely to be profitable
+ !loop.containsCall && // Calls force spills, also lead to puzzling spill info.
+ len(loop.exits) <= 32 { // Almost no inner loops have more than 32 exits,
+ // and this allows use of a bitvector and a sparseMap.
+
+ // TODO: exit calculation is messed up for non-inner loops
+ // because of multilevel exits that are not part of the "exit"
+ // count.
+
+ // Compute the set of spill-movement candidates live at entry to exit blocks.
+ // isLoopSpillCandidate filters for
+ // (1) defined in appropriate loop
+ // (2) needs a register
+ // (3) spill not already used (in the loop)
+ // Condition (3) === "in a register at all loop exits"
+
+ entryCandidates.clear()
+
+ for whichExit, ss := range loop.exits {
+ // Start with live at end.
+ for _, li := range s.live[ss.ID] {
+ if s.isLoopSpillCandidate(loop, s.orig[li.ID]) {
+ entryCandidates.setBit(li.ID, uint(whichExit))
+ }
+ }
+ // Control can also be live.
+ if ss.Control != nil && s.isLoopSpillCandidate(loop, ss.Control) {
+ entryCandidates.setBit(ss.Control.ID, uint(whichExit))
+ }
+ // Walk backwards, filling in locally live values, removing those defined.
+ for i := len(ss.Values) - 1; i >= 0; i-- {
+ v := ss.Values[i]
+ entryCandidates.remove(v.ID) // v is defined here, outside the loop, so it cannot be a candidate; removing it just keeps the set smaller.
+ for _, a := range v.Args {
+ if s.isLoopSpillCandidate(loop, a) {
+ entryCandidates.setBit(a.ID, uint(whichExit))
+ }
+ }
+ }
+ }
+
+ for _, e := range loop.spills {
+ whichblocks := entryCandidates.get(e.ID)
+ oldSpill := s.values[e.ID].spill
+ if whichblocks != 0 && whichblocks != -1 { // -1 = not in map.
+ toSink = append(toSink, spillToSink{spill: oldSpill, dests: whichblocks})
+ }
+ }
+
+ } // loop is inner etc
+ loop.scratch = 0 // Don't leave a mess, just in case.
+ loop.spills = nil
+ } // if scratch == nBlocks
+ } // if loop is not nil
+
// Clear any final uses.
// All that is left should be the pseudo-uses added for values which
// are live at the end of b.
// Constants, SP, SB, ...
continue
}
+ loop := s.loopForBlock(spill.Block)
+ if loop != nil {
+ nSpillsInner--
+ }
+
spill.Args[0].Uses--
f.freeValue(spill)
+ nSpills--
}
+
for _, b := range f.Blocks {
i := 0
for _, v := range b.Values {
// Not important now because this is the last phase that manipulates Values
}
+ // Must clear these out before any potential recycling, though that's
+ // not currently implemented.
+ for i, ts := range toSink {
+ vsp := ts.spill
+ if vsp.Op == OpInvalid { // This spill was completely eliminated
+ toSink[i].spill = nil
+ }
+ }
+
// Anything that didn't get a register gets a stack location here.
// (StoreReg, stack-based phis, inputs, ...)
stacklive := stackalloc(s.f, s.spillLive)
// Fix up all merge edges.
s.shuffle(stacklive)
+
+ // Insert moved spills (that have not been marked invalid above)
+ // at start of appropriate block and remove the originals from their
+ // location within loops. Notice that this can break SSA form;
+ // if a spill is sunk to multiple exits, there will be no phi for that
+ // spill at a join point downstream of those two exits, though the
+ // two spills will target the same stack slot. Notice also that this
+ // takes place after stack allocation, so the stack allocator does
+ // not need to process these malformed flow graphs.
+sinking:
+ for _, ts := range toSink {
+ vsp := ts.spill
+ if vsp == nil { // This spill was completely eliminated
+ nSpillsSunkUnused++
+ continue sinking
+ }
+ e := ts.spilledValue()
+ if s.values[e.ID].spillUsedShuffle {
+ nSpillsNotSunkLateUse++
+ continue sinking
+ }
+
+ // Move spills to a better (outside-of-loop) block.
+ // This would be costly if it occurred very often, but it doesn't.
+ b := vsp.Block
+ loop := s.loopnest.b2l[b.ID]
+ dests := ts.dests
+
+ // Pre-check to be sure that spilled value is still in expected register on all exits where live.
+ check_val_still_in_reg:
+ for i := uint(0); i < 32 && dests != 0; i++ {
+
+ if dests&(1<<i) == 0 {
+ continue
+ }
+ dests ^= 1 << i
+ d := loop.exits[i]
+ if len(d.Preds) > 1 {
+ panic("Should be impossible given critical edges removed")
+ }
+ p := d.Preds[0] // block in loop exiting to d.
+
+ endregs := s.endRegs[p.ID]
+ for _, regrec := range endregs {
+ if regrec.v == e && regrec.r != noRegister && regrec.c == e { // TODO: regrec.c != e implies different spill possible.
+ continue check_val_still_in_reg
+ }
+ }
+ // If here, the register assignment was lost down at least one exit and it can't be sunk
+ if s.f.pass.debug > moveSpills {
+ s.f.Config.Warnl(e.Line, "lost register assignment for spill %v in %v at exit %v to %v",
+ vsp, b, p, d)
+ }
+ nSpillsChanged++
+ continue sinking
+ }
+
+ nSpillsSunk++
+ nSpillsInner--
+ // Don't update nSpills: the spill is merely moved, and even if it is duplicated, the number of spills along any one path does not increase.
+
+ dests = ts.dests
+
+ // remove vsp from b.Values
+ i := 0
+ for _, w := range b.Values {
+ if vsp == w {
+ continue
+ }
+ b.Values[i] = w
+ i++
+ }
+ b.Values = b.Values[:i]
+
+ for i := uint(0); i < 32 && dests != 0; i++ {
+
+ if dests&(1<<i) == 0 {
+ continue
+ }
+
+ dests ^= 1 << i
+
+ d := loop.exits[i]
+ vspnew := d.NewValue1(e.Line, OpStoreReg, e.Type, e)
+
+ if s.f.pass.debug > moveSpills {
+ s.f.Config.Warnl(e.Line, "moved spill %v in %v for %v to %v in %v",
+ vsp, b, e, vspnew, d)
+ }
+
+ f.setHome(vspnew, f.getHome(vsp.ID)) // copy stack home
+
+ // Shuffle vspnew to the beginning of its block: NewValue1 appended
+ // it at the end, but the spill must execute while e is still in its
+ // register, before anything else in d can clobber it.
+ copy(d.Values[1:], d.Values[0:len(d.Values)-1])
+ d.Values[0] = vspnew
+ }
+ }
+
+ if f.pass.stats > 0 {
+ f.logStat("spills_info",
+ nSpills, "spills", nSpillsInner, "inner_spills_remaining", nSpillsSunk, "inner_spills_sunk", nSpillsSunkUnused, "inner_spills_unused", nSpillsNotSunkLateUse, "inner_spills_shuffled", nSpillsChanged, "inner_spills_changed")
+ }
+}
+
+// isLoopSpillCandidate indicates whether the spill for v satisfies preliminary
+// spill-sinking conditions just after the last block of loop has been processed.
+// In particular:
+// v needs a register.
+// v's spill is not (YET) used.
+// v's definition is within loop.
+// The spill may be used in the future, either by an outright use
+// in the code, or by shuffling code inserted after stack allocation.
+// Outright uses cause sinking; shuffling (within the loop) inhibits it.
+func (s *regAllocState) isLoopSpillCandidate(loop *loop, v *Value) bool {
+ return s.values[v.ID].needReg && !s.values[v.ID].spillUsed && s.loopnest.b2l[v.Block.ID] == loop
+}
+
+// lateSpillUse notes a late (after stack allocation) use of spill c.
+// This will inhibit spill sinking.
+func (s *regAllocState) lateSpillUse(c *Value) {
+ // TODO investigate why this is necessary.
+ // It appears that an outside-the-loop use of
+ // an otherwise sinkable spill makes the spill
+ // a candidate for shuffling, when it would not
+ // otherwise have been the case (spillUsed was not
+ // true when isLoopSpillCandidate was called, yet
+ // it was shuffled). Such shuffling cuts the amount
+ // of spill sinking by more than half (in make.bash).
+ v := s.orig[c.ID]
+ if v != nil {
+ s.values[v.ID].spillUsedShuffle = true
+ }
}
// shuffle fixes up all the merge edges (those going into blocks of indegree > 1).
if _, isReg := loc.(*Register); isReg {
c = e.p.NewValue1(c.Line, OpCopy, c.Type, c)
} else {
+ e.s.lateSpillUse(c)
c = e.p.NewValue1(c.Line, OpLoadReg, c.Type, c)
}
e.set(r, vid, c, false)
}
} else {
if dstReg {
+ e.s.lateSpillUse(c)
x = e.p.NewValue1(c.Line, OpLoadReg, c.Type, c)
} else {
// mem->mem. Use temp register.
e.erase(loc)
r := e.findRegFor(c.Type)
+ e.s.lateSpillUse(c)
t := e.p.NewValue1(c.Line, OpLoadReg, c.Type, c)
e.set(r, vid, t, false)
x = e.p.NewValue1(c.Line, OpStoreReg, loc.(LocalSlot).Type, t)