From: Keith Randall Date: Wed, 7 Sep 2016 21:04:31 +0000 (-0700) Subject: cmd/compile: improve tighten pass X-Git-Tag: go1.8beta1~1217 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=dd24b1098ae8923b98a13560c89ae59fc0c49258;p=gostls13.git cmd/compile: improve tighten pass Move a value to the block which is the lowest common ancestor in the dominator tree of all of its uses. Make sure not to move a value into a loop. Makes the tighten pass on average (across go1 benchmarks) 40% slower. Still not a big contributor to overall compile time. Binary size is just a tad smaller. name old time/op new time/op delta BinaryTree17-12 2.77s ± 9% 2.76s ± 9% ~ (p=0.878 n=8+8) Fannkuch11-12 2.75s ± 1% 2.74s ± 1% ~ (p=0.232 n=8+7) FmtFprintfEmpty-12 48.9ns ± 9% 47.7ns ± 0% ~ (p=0.431 n=8+8) FmtFprintfString-12 143ns ± 8% 142ns ± 1% ~ (p=0.257 n=8+7) FmtFprintfInt-12 123ns ± 1% 122ns ± 1% -1.04% (p=0.026 n=7+8) FmtFprintfIntInt-12 195ns ± 7% 185ns ± 0% -5.32% (p=0.000 n=8+8) FmtFprintfPrefixedInt-12 194ns ± 4% 195ns ± 0% +0.81% (p=0.015 n=7+7) FmtFprintfFloat-12 267ns ± 0% 268ns ± 0% +0.37% (p=0.001 n=7+6) FmtManyArgs-12 800ns ± 0% 762ns ± 1% -4.78% (p=0.000 n=8+8) GobDecode-12 7.67ms ± 2% 7.60ms ± 2% ~ (p=0.234 n=8+8) GobEncode-12 6.55ms ± 0% 6.57ms ± 1% ~ (p=0.336 n=7+8) Gzip-12 237ms ± 0% 238ms ± 0% +0.40% (p=0.017 n=7+7) Gunzip-12 40.8ms ± 0% 40.2ms ± 0% -1.52% (p=0.000 n=7+8) HTTPClientServer-12 208µs ± 3% 209µs ± 3% ~ (p=0.955 n=8+7) JSONEncode-12 16.2ms ± 1% 17.2ms ±11% +5.80% (p=0.001 n=7+8) JSONDecode-12 57.3ms ±12% 55.5ms ± 3% ~ (p=0.867 n=8+7) Mandelbrot200-12 4.68ms ± 6% 4.46ms ± 1% ~ (p=0.442 n=8+8) GoParse-12 4.27ms ±44% 3.42ms ± 1% -19.95% (p=0.005 n=8+8) RegexpMatchEasy0_32-12 75.1ns ± 0% 75.8ns ± 1% +0.99% (p=0.002 n=7+7) RegexpMatchEasy0_1K-12 963ns ± 0% 1021ns ± 6% +5.98% (p=0.001 n=7+7) RegexpMatchEasy1_32-12 72.4ns ±11% 70.8ns ± 1% ~ (p=0.368 n=8+8) RegexpMatchEasy1_1K-12 394ns ± 1% 399ns ± 0% +1.23% (p=0.000 n=8+7) RegexpMatchMedium_32-12 114ns ± 0% 115ns ± 1% +0.63% (p=0.021 n=7+7) RegexpMatchMedium_1K-12 35.9µs ± 0% 37.6µs ± 1% +4.72% (p=0.000 n=7+8) RegexpMatchHard_32-12 1.93µs ± 2% 1.91µs ± 0% -0.91% (p=0.001 n=7+7) RegexpMatchHard_1K-12 60.2µs ± 3% 61.2µs ±10% ~ (p=0.442 n=8+8) Revcomp-12 404ms ± 1% 406ms ± 1% ~ (p=0.054 n=8+7) Template-12 64.6ms ± 1% 63.5ms ± 1% -1.66% (p=0.000 n=8+8) TimeParse-12 347ns ± 8% 309ns ± 0% -11.13% (p=0.000 n=8+7) TimeFormat-12 343ns ± 4% 331ns ± 0% -3.34% (p=0.000 n=8+7) Change-Id: Id6da1239ddd4d0cb074ff29cffb06302d1c6d08f Reviewed-on: https://go-review.googlesource.com/28712 Run-TryBot: Keith Randall TryBot-Result: Gobot Gobot Reviewed-by: David Chase --- diff --git a/src/cmd/compile/internal/ssa/fuse.go b/src/cmd/compile/internal/ssa/fuse.go index afb8bb21f8..d5940da439 100644 --- a/src/cmd/compile/internal/ssa/fuse.go +++ b/src/cmd/compile/internal/ssa/fuse.go @@ -135,9 +135,11 @@ func fuseBlockPlain(b *Block) bool { p := e.b p.Succs[e.i] = Edge{c, i} } - if f := b.Func; f.Entry == b { + f := b.Func + if f.Entry == b { f.Entry = c } + f.invalidateCFG() // trash b, just in case b.Kind = BlockInvalid diff --git a/src/cmd/compile/internal/ssa/lca.go b/src/cmd/compile/internal/ssa/lca.go new file mode 100644 index 0000000000..ca9470302b --- /dev/null +++ b/src/cmd/compile/internal/ssa/lca.go @@ -0,0 +1,123 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ssa + +// Code to compute lowest common ancestors in the dominator tree. +// https://en.wikipedia.org/wiki/Lowest_common_ancestor +// https://en.wikipedia.org/wiki/Range_minimum_query#Solution_using_constant_time_and_linearithmic_space + +// lcaRange is a data structure that can compute lowest common ancestor queries +// in O(n lg n) precomputed space and O(1) time per query. +type lcaRange struct { + // Additional information about each block (indexed by block ID). + blocks []lcaRangeBlock + + // Data structure for range minimum queries. + // rangeMin[k][i] contains the ID of the minimum depth block + // in the Euler tour from positions i to i+1< 0 { + n := len(q) - 1 + bid := q[n].bid + cid := q[n].cid + q = q[:n] + + // Add block to tour. + blocks[bid].pos = int32(len(tour)) + tour = append(tour, bid) + + // Proceed down next child edge (if any). + if cid == 0 { + // This is our first visit to b. Set its depth. + blocks[bid].depth = blocks[blocks[bid].parent].depth + 1 + // Then explore its first child. + cid = blocks[bid].firstChild + } else { + // We've seen b before. Explore the next child. + cid = blocks[cid].sibling + } + if cid != 0 { + q = append(q, queueEntry{bid, cid}, queueEntry{cid, 0}) + } + } + + // Compute fast range-minimum query data structure + var rangeMin [][]ID + rangeMin = append(rangeMin, tour) // 1-size windows are just the tour itself. + for logS, s := 1, 2; s < len(tour); logS, s = logS+1, s*2 { + r := make([]ID, len(tour)-s+1) + for i := 0; i < len(tour)-s+1; i++ { + bid := rangeMin[logS-1][i] + bid2 := rangeMin[logS-1][i+s/2] + if blocks[bid2].depth < blocks[bid].depth { + bid = bid2 + } + r[i] = bid + } + rangeMin = append(rangeMin, r) + } + + return &lcaRange{blocks: blocks, rangeMin: rangeMin} +} + +// find returns the lowest common ancestor of a and b. +func (lca *lcaRange) find(a, b *Block) *Block { + if a == b { + return a + } + // Find the positions of a and bin the Euler tour. + p1 := lca.blocks[a.ID].pos + p2 := lca.blocks[b.ID].pos + if p1 > p2 { + p1, p2 = p2, p1 + } + + // The lowest common ancestor is the minimum depth block + // on the tour from p1 to p2. We've precomputed minimum + // depth blocks for powers-of-two subsequences of the tour. + // Combine the right two precomputed values to get the answer. + logS := uint(log2(int64(p2 - p1))) + bid1 := lca.rangeMin[logS][p1] + bid2 := lca.rangeMin[logS][p2-1< db { + da-- + a = lca.parent[a.ID] + } + for da < db { + db-- + b = lca.parent[b.ID] + } + for a != b { + a = lca.parent[a.ID] + b = lca.parent[b.ID] + } + return a +} + +func (lca *lcaEasy) depth(b *Block) int { + n := 0 + for b != nil { + b = lca.parent[b.ID] + n++ + } + return n +} diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go index 89b3d706dc..5af58d6ad8 100644 --- a/src/cmd/compile/internal/ssa/rewrite.go +++ b/src/cmd/compile/internal/ssa/rewrite.go @@ -189,6 +189,7 @@ func nto(x int64) int64 { } // log2 returns logarithm in base of uint64(n), with log2(0) = -1. +// Rounds down. func log2(n int64) (l int64) { l = -1 x := uint64(n) diff --git a/src/cmd/compile/internal/ssa/rewrite_test.go b/src/cmd/compile/internal/ssa/rewrite_test.go index b786df887b..7bd32ff1b2 100644 --- a/src/cmd/compile/internal/ssa/rewrite_test.go +++ b/src/cmd/compile/internal/ssa/rewrite_test.go @@ -92,6 +92,9 @@ func TestLog2(t *testing.T) { {1, 0}, {2, 1}, {4, 2}, + {7, 2}, + {8, 3}, + {9, 3}, {1024, 10}} for _, tc := range log2Tests { diff --git a/src/cmd/compile/internal/ssa/tighten.go b/src/cmd/compile/internal/ssa/tighten.go index 07f0375889..bed1704dc3 100644 --- a/src/cmd/compile/internal/ssa/tighten.go +++ b/src/cmd/compile/internal/ssa/tighten.go @@ -7,90 +7,135 @@ package ssa // tighten moves Values closer to the Blocks in which they are used. // This can reduce the amount of register spilling required, // if it doesn't also create more live values. -// For now, it handles only the trivial case in which a -// Value with one or fewer args is only used in a single Block, -// and not in a phi value. -// TODO: Do something smarter. // A Value can be moved to any block that // dominates all blocks in which it is used. -// Figure out when that will be an improvement. func tighten(f *Func) { - // For each value, the number of blocks in which it is used. - uses := make([]int32, f.NumValues()) + canMove := make([]bool, f.NumValues()) + for _, b := range f.Blocks { + for _, v := range b.Values { + switch v.Op { + case OpPhi, OpGetClosurePtr, OpArg, OpSelect0, OpSelect1: + // Phis need to stay in their block. + // GetClosurePtr & Arg must stay in the entry block. + // Tuple selectors must stay with the tuple generator. + continue + } + if len(v.Args) > 0 && v.Args[len(v.Args)-1].Type.IsMemory() { + // We can't move values which have a memory arg - it might + // make two memory values live across a block boundary. + continue + } + // Count arguments which will need a register. + narg := 0 + for _, a := range v.Args { + switch a.Op { + case OpConst8, OpConst16, OpConst32, OpConst64, OpAddr: + // Probably foldable into v, don't count as an argument needing a register. + // TODO: move tighten to a machine-dependent phase and use v.rematerializeable()? + default: + narg++ + } + } + if narg >= 2 && !v.Type.IsBoolean() { + // Don't move values with more than one input, as that may + // increase register pressure. + // We make an exception for boolean-typed values, as they will + // likely be converted to flags, and we want flag generators + // moved next to uses (because we only have 1 flag register). + continue + } + canMove[v.ID] = true + } + } + + // Build data structure for fast least-common-ancestor queries. + lca := makeLCArange(f) - // For each value, whether that value is ever an arg to a phi value. - phi := make([]bool, f.NumValues()) + // For each moveable value, record the block that dominates all uses found so far. + target := make([]*Block, f.NumValues()) - // For each value, one block in which that value is used. - home := make([]*Block, f.NumValues()) + // Grab loop information. + // We use this to make sure we don't tighten a value into a (deeper) loop. + idom := f.idom() + loops := f.loopnest() + loops.calculateDepths() changed := true for changed { changed = false - // Reset uses - for i := range uses { - uses[i] = 0 + // Reset target + for i := range target { + target[i] = nil } - // No need to reset home; any relevant values will be written anew anyway. - // No need to reset phi; once used in a phi, always used in a phi. + // Compute target locations (for moveable values only). + // target location = the least common ancestor of all uses in the dominator tree. for _, b := range f.Blocks { for _, v := range b.Values { - for _, w := range v.Args { + for i, a := range v.Args { + if !canMove[a.ID] { + continue + } + use := b if v.Op == OpPhi { - phi[w.ID] = true + use = b.Preds[i].b + } + if target[a.ID] == nil { + target[a.ID] = use + } else { + target[a.ID] = lca.find(target[a.ID], use) } - uses[w.ID]++ - home[w.ID] = b } } - if b.Control != nil { - uses[b.Control.ID]++ - home[b.Control.ID] = b + if c := b.Control; c != nil { + if !canMove[c.ID] { + continue + } + if target[c.ID] == nil { + target[c.ID] = b + } else { + target[c.ID] = lca.find(target[c.ID], b) + } } } + // If the target location is inside a loop, + // move the target location up to just before the loop head. for _, b := range f.Blocks { - for i := 0; i < len(b.Values); i++ { - v := b.Values[i] - switch v.Op { - case OpPhi, OpGetClosurePtr, OpConvert, OpArg: - // GetClosurePtr & Arg must stay in entry block. - // OpConvert must not float over call sites. - // TODO do we instead need a dependence edge of some sort for OpConvert? - // Would memory do the trick, or do we need something else that relates - // to safe point operations? + origloop := loops.b2l[b.ID] + for _, v := range b.Values { + t := target[v.ID] + if t == nil { continue - default: } - if v.Op == OpSelect0 || v.Op == OpSelect1 { - // tuple selector must stay with tuple generator - continue + targetloop := loops.b2l[t.ID] + for targetloop != nil && (origloop == nil || targetloop.depth > origloop.depth) { + t = idom[targetloop.header.ID] + target[v.ID] = t + targetloop = loops.b2l[t.ID] } - if len(v.Args) > 0 && v.Args[len(v.Args)-1].Type.IsMemory() { - // We can't move values which have a memory arg - it might - // make two memory values live across a block boundary. + } + } + + // Move values to target locations. + for _, b := range f.Blocks { + for i := 0; i < len(b.Values); i++ { + v := b.Values[i] + t := target[v.ID] + if t == nil || t == b { + // v is not moveable, or is already in correct place. continue } - if uses[v.ID] == 1 && !phi[v.ID] && home[v.ID] != b && (len(v.Args) < 2 || v.Type.IsBoolean()) { - // v is used in exactly one block, and it is not b. - // Furthermore, it takes at most one input, - // so moving it will not increase the - // number of live values anywhere. - // Move v to that block. - // Also move bool generators even if they have more than 1 input. - // They will likely be converted to flags, and we want flag - // generators moved next to uses (because we only have 1 flag register). - c := home[v.ID] - c.Values = append(c.Values, v) - v.Block = c - last := len(b.Values) - 1 - b.Values[i] = b.Values[last] - b.Values[last] = nil - b.Values = b.Values[:last] - changed = true - } + // Move v to the block which dominates its uses. + t.Values = append(t.Values, v) + v.Block = t + last := len(b.Values) - 1 + b.Values[i] = b.Values[last] + b.Values[last] = nil + b.Values = b.Values[:last] + changed = true + i-- } } } diff --git a/src/cmd/compile/internal/ssa/trim.go b/src/cmd/compile/internal/ssa/trim.go index 8ffb459074..9b57b5a31e 100644 --- a/src/cmd/compile/internal/ssa/trim.go +++ b/src/cmd/compile/internal/ssa/trim.go @@ -23,6 +23,7 @@ func trim(f *Func) { j := b.Succs[0].i p.Succs[i] = Edge{s, j} s.Preds[j] = Edge{p, i} + f.invalidateCFG() } tail := f.Blocks[n:] for i := range tail {