From 4d6722a8fd40e86822535b82e0dc9d2b5fd25b74 Mon Sep 17 00:00:00 2001 From: Junyang Shao Date: Sun, 9 Mar 2025 20:14:32 +0000 Subject: [PATCH] cmd/compile: match more patterns for shortcircuit MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This CL tries to generalize the pattern matching of certain shortcircuit-able CFGs a bit more: For a shortcircuit-able CFG: p q \ / b / \ t u Where the constant branch is t, and b has multiple phi values other than the control phi. For the non-constant branch target u, we try to match the "diamond" shape CFG: p q \ / b / \ t u \ / m or p q \ / b |\ | u |/ m Instead of matching u as a single block, we know try to generalize it as a subgraph that satisfy condition: it's a DAG that has a single entry point u, and has a path to m. compilebench stats: │ old.txt │ new.txt │ │ sec/op │ sec/op vs base │ Template 109.4m ± 3% 109.8m ± 3% ~ (p=0.796 n=10) Unicode 94.23m ± 1% 93.85m ± 1% ~ (p=0.631 n=10) GoTypes 538.2m ± 1% 538.8m ± 1% ~ (p=0.912 n=10) Compiler 90.21m ± 1% 90.02m ± 1% ~ (p=0.436 n=10) SSA 3.318 ± 1% 3.323 ± 1% ~ (p=1.000 n=10) Flate 69.38m ± 1% 69.57m ± 2% ~ (p=0.529 n=10) GoParser 128.5m ± 1% 127.4m ± 1% ~ (p=0.075 n=10) Reflect 267.4m ± 1% 267.2m ± 2% ~ (p=0.739 n=10) Tar 127.7m ± 2% 126.4m ± 1% ~ (p=0.353 n=10) XML 149.5m ± 1% 149.6m ± 2% ~ (p=0.684 n=10) LinkCompiler 390.0m ± 1% 388.4m ± 2% ~ (p=0.353 n=10) ExternalLinkCompiler 1.296 ± 0% 1.296 ± 1% ~ (p=0.971 n=10) LinkWithoutDebugCompiler 226.3m ± 1% 225.5m ± 1% ~ (p=0.393 n=10) StdCmd 13.26 ± 0% 13.25 ± 1% ~ (p=0.529 n=10) geomean 319.3m 318.8m -0.17% │ old.txt │ new.txt │ │ user-sec/op │ user-sec/op vs base │ Template 293.1m ± 3% 291.4m ± 11% ~ (p=0.436 n=10) Unicode 91.09m ± 5% 87.61m ± 7% ~ (p=0.165 n=10) GoTypes 1.932 ± 3% 1.926 ± 3% ~ (p=0.739 n=10) Compiler 125.8m ± 3% 121.5m ± 10% ~ (p=0.481 n=10) SSA 18.93 ± 3% 18.89 ± 1% ~ (p=0.684 n=10) Flate 158.5m ± 5% 160.0m ± 7% ~ (p=0.971 n=10) GoParser 316.0m ± 9% 327.4m ± 7% ~ (p=0.052 n=10) Reflect 845.6m ± 6% 861.6m ± 3% ~ (p=0.579 n=10) Tar 358.1m ± 5% 348.5m ± 4% ~ (p=0.089 n=10) XML 382.4m ± 4% 392.2m ± 3% ~ (p=0.143 n=10) LinkCompiler 609.1m ± 4% 627.9m ± 3% ~ (p=0.123 n=10) ExternalLinkCompiler 1.336 ± 2% 1.343 ± 4% ~ (p=0.565 n=10) LinkWithoutDebugCompiler 248.7m ± 3% 248.0m ± 1% ~ (p=0.853 n=10) geomean 506.4m 506.8m +0.08% │ old.txt │ new.txt │ │ text-bytes │ text-bytes vs base │ HelloSize 965.8Ki ± 0% 965.0Ki ± 0% -0.08% (p=0.000 n=10) CmdGoSize 12.30Mi ± 0% 12.29Mi ± 0% -0.08% (p=0.000 n=10) geomean 3.406Mi 3.403Mi -0.08% │ old.txt │ new.txt │ │ data-bytes │ data-bytes vs base │ HelloSize 15.08Ki ± 0% 15.08Ki ± 0% ~ (p=1.000 n=10) ¹ CmdGoSize 408.5Ki ± 0% 408.5Ki ± 0% ~ (p=1.000 n=10) ¹ geomean 78.49Ki 78.49Ki +0.00% ¹ all samples are equal │ old.txt │ new.txt │ │ bss-bytes │ bss-bytes vs base │ HelloSize 142.0Ki ± 0% 142.0Ki ± 0% ~ (p=1.000 n=10) ¹ CmdGoSize 206.4Ki ± 0% 206.4Ki ± 0% ~ (p=1.000 n=10) ¹ geomean 171.2Ki 171.2Ki +0.00% ¹ all samples are equal │ old.txt │ new.txt │ │ exe-bytes │ exe-bytes vs base │ HelloSize 1.466Mi ± 0% 1.462Mi ± 0% -0.27% (p=0.000 n=10) CmdGoSize 18.19Mi ± 0% 18.17Mi ± 0% -0.10% (p=0.000 n=10) geomean 5.164Mi 5.154Mi -0.18% Fixes #72132 Change-Id: I3d1cb10b6a158c5750adc23c79709d63dbd771f3 Reviewed-on: https://go-review.googlesource.com/c/go/+/656255 Auto-Submit: Junyang Shao LUCI-TryBot-Result: Go LUCI Reviewed-by: Keith Randall Reviewed-by: Keith Randall --- src/cmd/compile/internal/ssa/shortcircuit.go | 217 +++++++++++++------ 1 file changed, 150 insertions(+), 67 deletions(-) diff --git a/src/cmd/compile/internal/ssa/shortcircuit.go b/src/cmd/compile/internal/ssa/shortcircuit.go index d7d0b6fe33..b86596026b 100644 --- a/src/cmd/compile/internal/ssa/shortcircuit.go +++ b/src/cmd/compile/internal/ssa/shortcircuit.go @@ -135,6 +135,8 @@ func shortcircuitBlock(b *Block) bool { // to reason about the values of phis. return false } + // We only process blocks with only phi values except for control + // value and its wrappers. if len(b.Values) != nval+nOtherPhi { return false } @@ -293,78 +295,85 @@ func shortcircuitPhiPlan(b *Block, ctl *Value, cidx int, ti int64) func(*Value, // In these cases, we can reconstruct what the value // of any phi in b must be in the successor blocks. - if len(t.Preds) == 1 && len(t.Succs) == 1 && - len(u.Preds) == 1 && len(u.Succs) == 1 && - t.Succs[0].b == u.Succs[0].b && len(t.Succs[0].b.Preds) == 2 { - // p q - // \ / - // b - // / \ - // t u - // \ / - // m - // - // After the CFG modifications, this will look like - // - // p q - // | / - // | b - // |/ \ - // t u - // \ / - // m - // - // NB: t.Preds is (b, p), not (p, b). + if len(t.Preds) == 1 && len(t.Succs) == 1 && len(u.Preds) == 1 && + len(t.Succs[0].b.Preds) == 2 { m := t.Succs[0].b - return func(v *Value, i int) { - // Replace any uses of v in t and u with the value v must have, - // given that we have arrived at that block. - // Then move v to m and adjust its value accordingly; - // this handles all other uses of v. - argP, argQ := v.Args[cidx], v.Args[1^cidx] - u.replaceUses(v, argQ) - phi := t.Func.newValue(OpPhi, v.Type, t, v.Pos) - phi.AddArg2(argQ, argP) - t.replaceUses(v, phi) - if v.Uses == 0 { - return - } - v.moveTo(m, i) - // The phi in m belongs to whichever pred idx corresponds to t. - if m.Preds[0].b == t { - v.SetArgs2(phi, argQ) - } else { - v.SetArgs2(argQ, phi) + if visited := u.flowsTo(m, 5); visited != nil { + // p q + // \ / + // b + // / \ + // t U (sub graph that satisfy condition in flowsTo) + // \ / + // m + // + // After the CFG modifications, this will look like + // + // p q + // | / + // | b + // |/ \ + // t U + // \ / + // m + // + // NB: t.Preds is (b, p), not (p, b). + return func(v *Value, i int) { + // Replace any uses of v in t and u with the value v must have, + // given that we have arrived at that block. + // Then move v to m and adjust its value accordingly; + // this handles all other uses of v. + argP, argQ := v.Args[cidx], v.Args[1^cidx] + phi := t.Func.newValue(OpPhi, v.Type, t, v.Pos) + phi.AddArg2(argQ, argP) + t.replaceUses(v, phi) + for bb := range visited { + bb.replaceUses(v, argQ) + } + if v.Uses == 0 { + return + } + v.moveTo(m, i) + // The phi in m belongs to whichever pred idx corresponds to t. + if m.Preds[0].b == t { + v.SetArgs2(phi, argQ) + } else { + v.SetArgs2(argQ, phi) + } } } } - if len(t.Preds) == 2 && len(u.Preds) == 1 && len(u.Succs) == 1 && u.Succs[0].b == t { - // p q - // \ / - // b - // |\ - // | u - // |/ - // t - // - // After the CFG modifications, this will look like - // - // q - // / - // b - // |\ - // p | u - // \|/ - // t - // - // NB: t.Preds is (b or u, b or u, p). - return func(v *Value, i int) { - // Replace any uses of v in u. Then move v to t. - argP, argQ := v.Args[cidx], v.Args[1^cidx] - u.replaceUses(v, argQ) - v.moveTo(t, i) - v.SetArgs3(argQ, argQ, argP) + if len(t.Preds) == 2 && len(u.Preds) == 1 { + if visited := u.flowsTo(t, 5); visited != nil { + // p q + // \ / + // b + // |\ + // | U ((sub graph that satisfy condition in flowsTo)) + // |/ + // t + // + // After the CFG modifications, this will look like + // + // q + // / + // b + // |\ + // p | U + // \|/ + // t + // + // NB: t.Preds is (b or U, b or U, p). + return func(v *Value, i int) { + // Replace any uses of v in U. Then move v to t. + argP, argQ := v.Args[cidx], v.Args[1^cidx] + for bb := range visited { + bb.replaceUses(v, argQ) + } + v.moveTo(t, i) + v.SetArgs3(argQ, argQ, argP) + } } } @@ -511,3 +520,77 @@ func (v *Value) moveTo(dst *Block, i int) { src.Values[last] = nil src.Values = src.Values[:last] } + +// flowsTo checks that the subgraph starting from v and ends at t is a DAG, with +// the following constraints: +// +// (1) v can reach t. +// (2) v's connected component removing the paths containing t is a DAG. +// (3) The blocks in the subgraph G defined in (2) has all their preds also in G, +// except v. +// (4) The subgraph defined in (2) has a size smaller than cap. +// +// We know that the subgraph G defined in constraint (2)(3) has the property that v +// dominates all the blocks in G: +// If there exist a block x in G that is not dominated by v, then there exist a +// path P from entry to x that does not contain v. Denote x's predecessor in P +// as x', then x' must also be in G given constraint (3), same to its pred x'' +// in P. Given constraint (2), by going back in P we will in the end reach v, +// which conflicts with the definition of P. +// +// Constraint (2)'s DAG requirement could be further relaxed to contain "internal" +// loops that doesn't change the dominance relation of v. But that is more subtle +// and requires another constraint on the source block v, and a more complex proof. +// Furthermore optimizing the branch guarding a loop might bring less gains as the +// loop itself might be the bottleneck. +func (v *Block) flowsTo(t *Block, cap int) map[*Block]struct{} { + seen := map[*Block]struct{}{} + var boundedDFS func(b *Block) + hasPathToT := false + fullyExplored := true + isDAG := true + visited := map[*Block]struct{}{} + boundedDFS = func(b *Block) { + if _, ok := seen[b]; ok { + return + } + if _, ok := visited[b]; ok { + isDAG = false + return + } + if b == t { + // do not put t into seen, this way + // if v can reach t's connected component without going through t, + // it will fail the pred check after boundedDFSUntil. + hasPathToT = true + return + } + if len(seen) > cap { + fullyExplored = false + return + } + seen[b] = struct{}{} + visited[b] = struct{}{} + for _, se := range b.Succs { + boundedDFS(se.b) + if !(isDAG && fullyExplored) { + return + } + } + delete(visited, b) + } + boundedDFS(v) + if hasPathToT && fullyExplored && isDAG { + for b := range seen { + if b != v { + for _, se := range b.Preds { + if _, ok := seen[se.b]; !ok { + return nil + } + } + } + } + return seen + } + return nil +} -- 2.51.0