cmd/compile: use depth first topological sort algorithm for layout

author erifan01 <eric.fang@arm.com>

Thu, 23 Jul 2020 02:24:56 +0000 (10:24 +0800)

committer eric fang <eric.fang@arm.com>

Tue, 16 Mar 2021 02:44:54 +0000 (02:44 +0000)
author erifan01 <eric.fang@arm.com>
Thu, 23 Jul 2020 02:24:56 +0000 (10:24 +0800)
committer eric fang <eric.fang@arm.com>
Tue, 16 Mar 2021 02:44:54 +0000 (02:44 +0000)
diff --git a/src/cmd/compile/internal/ssa/block.go b/src/cmd/compile/internal/ssa/block.go

index 937c757b2153bed33b90b9981f3ba498b8cf022e..71ca774431e33cb99a4344d1aaa6e341a052997a 100644 (file)
--- a/src/cmd/compile/internal/ssa/block.go
+++ b/src/cmd/compile/internal/ssa/block.go
@@ -358,6 +358,22 @@ func (b *Block) AuxIntString() string {
         }
  }
  
+// likelyBranch reports whether every predecessor of b either has b as its only successor or, having exactly two successors, marks the edge to b as the likely one; it returns false if b has no predecessors.
+func (b *Block) likelyBranch() bool {
+       if len(b.Preds) == 0 {
+               return false // no incoming edge, so nothing marks b as likely
+       }
+       for _, e := range b.Preds {
+               p := e.b
+               if len(p.Succs) == 1 || len(p.Succs) == 2 && (p.Likely == BranchLikely && p.Succs[0].b == b ||
+                       p.Likely == BranchUnlikely && p.Succs[1].b == b) {
+                       continue // p reaches b as its only successor or through its likely edge
+               }
+               return false // p has some other shape, or its edge to b is not marked likely
+       }
+       return true
+}
+
  func (b *Block) Logf(msg string, args ...interface{})   { b.Func.Logf(msg, args...) }
  func (b *Block) Log() bool                              { return b.Func.Log() }
  func (b *Block) Fatalf(msg string, args ...interface{}) { b.Func.Fatalf(msg, args...) }
diff --git a/src/cmd/compile/internal/ssa/layout.go b/src/cmd/compile/internal/ssa/layout.go

index 30b7b97d040cdb040ab12b99c520ec27cac3bca4..a7fd73aead19815b2bd59d3ad34a7c032a6ed8cc 100644 (file)
--- a/src/cmd/compile/internal/ssa/layout.go
+++ b/src/cmd/compile/internal/ssa/layout.go
@@ -41,8 +41,13 @@ func layoutOrder(f *Func) []*Block {
         indegree := make([]int, f.NumBlocks())
         posdegree := f.newSparseSet(f.NumBlocks()) // blocks with positive remaining degree
         defer f.retSparseSet(posdegree)
-       zerodegree := f.newSparseSet(f.NumBlocks()) // blocks with zero remaining degree
-       defer f.retSparseSet(zerodegree)
+       // blocks with zero remaining degree. Use a slice to simulate a LIFO queue to implement
+       // the depth-first topological sorting algorithm.
+       var zerodegree []ID
+       // LIFO queue. Track the successor blocks of the scheduled block so that when we
+       // encounter loops, we choose to schedule the successor block of the most recently
+       // scheduled block.
+       var succs []ID
         exit := f.newSparseSet(f.NumBlocks()) // exit blocks
         defer f.retSparseSet(exit)
  
@@ -88,7 +93,8 @@ func layoutOrder(f *Func) []*Block {
                 }
                 indegree[b.ID] = len(b.Preds)
                 if len(b.Preds) == 0 {
-                       zerodegree.add(b.ID)
+                       // Push an element to the tail of the queue.
+                       zerodegree = append(zerodegree, b.ID)
                 } else {
                         posdegree.add(b.ID)
                 }
@@ -105,12 +111,24 @@ blockloop:
                         break
                 }
  
-               for _, e := range b.Succs {
-                       c := e.b
+               // Here, the order of traversing the b.Succs affects the direction in which the topological
+               // sort advances in depth. Take the following cfg as an example, regardless of other factors.
+               //           b1
+               //         0/ \1
+               //        b2   b3
+               // If b.Succs is traversed in order, the right child node b3 will be scheduled immediately
+               // after b1; if it is traversed in reverse order, the left child node b2 will be scheduled
+               // immediately after b1. The test results show that reverse traversal performs a little
+               // better.
+               // Note: You need to consider both layout and register allocation when testing performance.
+               for i := len(b.Succs) - 1; i >= 0; i-- {
+                       c := b.Succs[i].b
                         indegree[c.ID]--
                         if indegree[c.ID] == 0 {
                                 posdegree.remove(c.ID)
-                               zerodegree.add(c.ID)
+                               zerodegree = append(zerodegree, c.ID)
+                       } else {
+                               succs = append(succs, c.ID)
                         }
                 }
  
@@ -132,30 +150,30 @@ blockloop:
  
                 // Use degree for now.
                 bid = 0
-               mindegree := f.NumBlocks()
-               for _, e := range order[len(order)-1].Succs {
-                       c := e.b
-                       if scheduled[c.ID] || c.Kind == BlockExit {
-                               continue
-                       }
-                       if indegree[c.ID] < mindegree {
-                               mindegree = indegree[c.ID]
-                               bid = c.ID
-                       }
-               }
-               if bid != 0 {
-                       continue
-               }
                 // TODO: improve this part
                 // No successor of the previously scheduled block works.
                 // Pick a zero-degree block if we can.
-               for zerodegree.size() > 0 {
-                       cid := zerodegree.pop()
+               for len(zerodegree) > 0 {
+                       // Pop an element from the tail of the queue.
+                       cid := zerodegree[len(zerodegree)-1]
+                       zerodegree = zerodegree[:len(zerodegree)-1]
+                       if !scheduled[cid] {
+                               bid = cid
+                               continue blockloop
+                       }
+               }
+
+               // Still nothing, pick the unscheduled successor block encountered most recently.
+               for len(succs) > 0 {
+                       // Pop an element from the tail of the queue.
+                       cid := succs[len(succs)-1]
+                       succs = succs[:len(succs)-1]
                         if !scheduled[cid] {
                                 bid = cid
                                 continue blockloop
                         }
                 }
+
                 // Still nothing, pick any non-exit block.
                 for posdegree.size() > 0 {
                         cid := posdegree.pop()
diff --git a/src/cmd/compile/internal/ssa/looprotate.go b/src/cmd/compile/internal/ssa/looprotate.go

index 2e5e421df7ffa14f9d573129b0eaec7610182f10..35010a78d8e0492623bb3b017337aab9530c2719 100644 (file)
--- a/src/cmd/compile/internal/ssa/looprotate.go
+++ b/src/cmd/compile/internal/ssa/looprotate.go
@@ -68,12 +68,15 @@ func loopRotate(f *Func) {
                         if nextb == p { // original loop predecessor is next
                                 break
                         }
-                       if loopnest.b2l[nextb.ID] != loop { // about to leave loop
-                               break
+                       if loopnest.b2l[nextb.ID] == loop {
+                               after[p.ID] = append(after[p.ID], nextb)
                         }
-                       after[p.ID] = append(after[p.ID], nextb)
                         b = nextb
                 }
+               // Swap the loop header and p so that we'll handle p before the header when moving blocks.
+               f.Blocks[idToIdx[loop.header.ID]] = p
+               f.Blocks[idToIdx[p.ID]] = loop.header
+               idToIdx[loop.header.ID], idToIdx[p.ID] = idToIdx[p.ID], idToIdx[loop.header.ID]
  
                 // Place b after p.
                 for _, b := range after[p.ID] {
@@ -86,21 +89,23 @@ func loopRotate(f *Func) {
         // before the rest of the loop.  And that relies on the
         // fact that we only identify reducible loops.
         j := 0
-       for i, b := range f.Blocks {
+       // Some blocks that are not part of a loop may be placed
+       // between loop blocks. To prevent these blocks from
+       // being overwritten, use a temporary slice.
+       newOrder := make([]*Block, 0, f.NumBlocks())
+       for _, b := range f.Blocks {
                 if _, ok := move[b.ID]; ok {
                         continue
                 }
-               f.Blocks[j] = b
+               newOrder = append(newOrder, b)
                 j++
                 for _, a := range after[b.ID] {
-                       if j > i {
-                               f.Fatalf("head before tail in loop %s", b)
-                       }
-                       f.Blocks[j] = a
+                       newOrder = append(newOrder, a)
                         j++
                 }
         }
         if j != len(f.Blocks) {
                 f.Fatalf("bad reordering in looprotate")
         }
+       f.Blocks = newOrder
  }
diff --git a/src/cmd/compile/internal/ssa/regalloc.go b/src/cmd/compile/internal/ssa/regalloc.go

index c104a36888b4cb35fef821c1e11c57c300dfb869..18908681df3badb5f104c5399f220c7e62d76a13 100644 (file)
--- a/src/cmd/compile/internal/ssa/regalloc.go
+++ b/src/cmd/compile/internal/ssa/regalloc.go
@@ -241,12 +241,6 @@ type regAllocState struct {
         GReg        register
         allocatable regMask
  
-       // for each block, its primary predecessor.
-       // A predecessor of b is primary if it is the closest
-       // predecessor that appears before b in the layout order.
-       // We record the index in the Preds list where the primary predecessor sits.
-       primary []int32
-
         // live values at the end of each block.  live[b.ID] is a list of value IDs
         // which are live at the end of b, together with a count of how many instructions
         // forward to the next use.
@@ -304,6 +298,9 @@ type regAllocState struct {
  
         // choose a good order in which to visit blocks for allocation purposes.
         visitOrder []*Block
+
+       // blockOrder[b.ID] corresponds to the index of block b in visitOrder.
+       blockOrder []int32
  }
  
  type endReg struct {
@@ -636,9 +633,9 @@ func (s *regAllocState) init(f *Func) {
  
         // Compute block order. This array allows us to distinguish forward edges
         // from backward edges and compute how far they go.
-       blockOrder := make([]int32, f.NumBlocks())
+       s.blockOrder = make([]int32, f.NumBlocks())
         for i, b := range s.visitOrder {
-               blockOrder[b.ID] = int32(i)
+               s.blockOrder[b.ID] = int32(i)
         }
  
         s.regs = make([]regState, s.numRegs)
@@ -664,22 +661,6 @@ func (s *regAllocState) init(f *Func) {
         }
         s.computeLive()
  
-       // Compute primary predecessors.
-       s.primary = make([]int32, f.NumBlocks())
-       for _, b := range s.visitOrder {
-               best := -1
-               for i, e := range b.Preds {
-                       p := e.b
-                       if blockOrder[p.ID] >= blockOrder[b.ID] {
-                               continue // backward edge
-                       }
-                       if best == -1 || blockOrder[p.ID] > blockOrder[b.Preds[best].b.ID] {
-                               best = i
-                       }
-               }
-               s.primary[b.ID] = int32(best)
-       }
-
         s.endRegs = make([][]endReg, f.NumBlocks())
         s.startRegs = make([][]startReg, f.NumBlocks())
         s.spillLive = make([][]ID, f.NumBlocks())
@@ -957,10 +938,49 @@ func (s *regAllocState) regalloc(f *Func) {
                         // This is the complicated case. We have more than one predecessor,
                         // which means we may have Phi ops.
  
-                       // Start with the final register state of the primary predecessor
-                       idx := s.primary[b.ID]
+                       // Start with the final register state of the predecessor with the fewest spilled values.
+                       // This is based on the following points:
+                       // 1. Fewer spilled values indicate that the register pressure on this path is lower,
+                       //    so the values of this block are more likely to be allocated to registers.
+                       // 2. Avoid a predecessor that contains a function call, because such a predecessor
+                       //    usually generates a lot of spills and loses the previous
+                       //    allocation state.
+                       // TODO: Improve this part. At least the size of endRegs of the predecessor also has
+                       // an impact on the code size and compiler speed. But it is not easy to find a simple
+                       // and efficient method that combines multiple factors.
+                       idx := -1
+                       for i, p := range b.Preds {
+                               // If the predecessor has not been visited yet, skip it because its end state
+                               // (endRegs and spillLive) has not been computed yet.
+                               pb := p.b
+                               if s.blockOrder[pb.ID] >= s.blockOrder[b.ID] {
+                                       continue
+                               }
+                               if idx == -1 {
+                                       idx = i
+                                       continue
+                               }
+                               pSel := b.Preds[idx].b
+                               if len(s.spillLive[pb.ID]) < len(s.spillLive[pSel.ID]) {
+                                       idx = i
+                               } else if len(s.spillLive[pb.ID]) == len(s.spillLive[pSel.ID]) {
+                                       // Use a bit of likely information. After the critical pass, pb and pSel must
+                                       // be plain blocks, so check the edges pb.Preds->pb instead of the edge pb->b.
+                                       // TODO: improve the prediction of the likely predecessor. The following
+                                       // method is only suitable for the simplest cases. For complex cases,
+                                       // the prediction may be inaccurate, but this does not affect the
+                                       // correctness of the program.
+                                       // According to the layout algorithm, the predecessor with the
+                                       // smaller blockOrder is the true branch, and the test results show
+                                       // that it is better to choose the predecessor with a smaller
+                                       // blockOrder than no choice.
+                                       if pb.likelyBranch() && !pSel.likelyBranch() || s.blockOrder[pb.ID] < s.blockOrder[pSel.ID] {
+                                               idx = i
+                                       }
+                               }
+                       }
                         if idx < 0 {
-                               f.Fatalf("block with no primary predecessor %s", b)
+                               f.Fatalf("bad visitOrder, no predecessor of %s has been visited before it", b)
                         }
                         p := b.Preds[idx].b
                         s.setState(s.endRegs[p.ID])
@@ -1048,7 +1068,7 @@ func (s *regAllocState) regalloc(f *Func) {
                                 // If one of the other inputs of v is in a register, and the register is available,
                                 // select this register, which can save some unnecessary copies.
                                 for i, pe := range b.Preds {
-                                       if int32(i) == idx {
+                                       if i == idx {
                                                 continue
                                         }
                                         ri := noRegister
diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go

index 0bdb66a376db2acd133d05a357dcd69c6e740c11..dea7e0ba61abbbc3d0ae3af494cde812c2a11b5a 100644 (file)
--- a/test/codegen/arithmetic.go
+++ b/test/codegen/arithmetic.go
@@ -322,6 +322,9 @@ func NoFix64A(divr int64) (int64, int64) {
         if divr > 5 {
                 d /= divr // amd64:-"JMP"
                 e %= divr // amd64:-"JMP"
+               // The following statement is to avoid conflict between the above check
+               // and the normal JMP generated at the end of the block.
+               d += e
         }
         return d, e
  }
@@ -333,6 +336,7 @@ func NoFix64B(divd int64) (int64, int64) {
         if divd > -9223372036854775808 {
                 d = divd / divr // amd64:-"JMP"
                 e = divd % divr // amd64:-"JMP"
+               d += e
         }
         return d, e
  }
@@ -347,6 +351,7 @@ func NoFix32A(divr int32) (int32, int32) {
                 // amd64:-"JMP"
                 // 386:-"JMP"
                 e %= divr
+               d += e
         }
         return d, e
  }
@@ -362,6 +367,7 @@ func NoFix32B(divd int32) (int32, int32) {
                 // amd64:-"JMP"
                 // 386:-"JMP"
                 e = divd % divr
+               d += e
         }
         return d, e
  }
@@ -376,6 +382,7 @@ func NoFix16A(divr int16) (int16, int16) {
                 // amd64:-"JMP"
                 // 386:-"JMP"
                 e %= divr
+               d += e
         }
         return d, e
  }
@@ -391,6 +398,7 @@ func NoFix16B(divd int16) (int16, int16) {
                 // amd64:-"JMP"
                 // 386:-"JMP"
                 e = divd % divr
+               d += e
         }
         return d, e
  }
diff --git a/test/codegen/comparisons.go b/test/codegen/comparisons.go

index 02bed386615cf9297a21782f8bca985c34cfb75d..719063cdc38041f77bcf1ca9e99c9b8eb6ad1c62 100644 (file)
--- a/test/codegen/comparisons.go
+++ b/test/codegen/comparisons.go
@@ -426,7 +426,7 @@ func UintGeqZero(a uint8, b uint16, c uint32, d uint64) int {
  }
  
  func UintGtZero(a uint8, b uint16, c uint32, d uint64) int {
-       // arm64: `CBZW`, `CBNZW`, `CBNZ`, -`(CMPW|CMP|BLS|BHI)`
+       // arm64: `(CBN?ZW)`, `(CBN?Z[^W])`, -`(CMPW|CMP|BLS|BHI)`
         if a > 0 || b > 0 || c > 0 || d > 0 {
                 return 1
         }
@@ -434,7 +434,7 @@ func UintGtZero(a uint8, b uint16, c uint32, d uint64) int {
  }
  
  func UintLeqZero(a uint8, b uint16, c uint32, d uint64) int {
-       // arm64: `CBNZW`, `CBZW`, `CBZ`, -`(CMPW|CMP|BHI|BLS)`
+       // arm64: `(CBN?ZW)`, `(CBN?Z[^W])`, -`(CMPW|CMP|BHI|BLS)`
         if a <= 0 || b <= 0 || c <= 0 || d <= 0 {
                 return 1
         }
@@ -442,7 +442,7 @@ func UintLeqZero(a uint8, b uint16, c uint32, d uint64) int {
  }
  
  func UintLtOne(a uint8, b uint16, c uint32, d uint64) int {
-       // arm64: `CBNZW`, `CBZW`, `CBZW`, `CBZ`, -`(CMPW|CMP|BHS|BLO)`
+       // arm64: `(CBN?ZW)`, `(CBN?Z[^W])`, -`(CMPW|CMP|BHS|BLO)`
         if a < 1 || b < 1 || c < 1 || d < 1 {
                 return 1
         }
@@ -450,7 +450,7 @@ func UintLtOne(a uint8, b uint16, c uint32, d uint64) int {
  }
  
  func UintGeqOne(a uint8, b uint16, c uint32, d uint64) int {
-       // arm64: `CBZW`, `CBNZW`, `CBNZ`, -`(CMPW|CMP|BLO|BHS)`
+       // arm64: `(CBN?ZW)`, `(CBN?Z[^W])`, -`(CMPW|CMP|BLO|BHS)`
         if a >= 1 || b >= 1 || c >= 1 || d >= 1 {
                 return 1
         }
author	erifan01 <eric.fang@arm.com>
	Thu, 23 Jul 2020 02:24:56 +0000 (10:24 +0800)
committer	eric fang <eric.fang@arm.com>
	Tue, 16 Mar 2021 02:44:54 +0000 (02:44 +0000)
src/cmd/compile/internal/ssa/block.go		patch \| blob \| history
src/cmd/compile/internal/ssa/layout.go		patch \| blob \| history
src/cmd/compile/internal/ssa/looprotate.go		patch \| blob \| history
src/cmd/compile/internal/ssa/regalloc.go		patch \| blob \| history
test/codegen/arithmetic.go		patch \| blob \| history
test/codegen/comparisons.go		patch \| blob \| history