From 3c36b8be660f409142a00ecd57115399ce8aabc9 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Fri, 20 Apr 2018 15:48:46 -0400 Subject: [PATCH] cmd/compile: incrementally compact liveness maps MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The per-Value slice of liveness maps is currently one of the largest sources of allocation in the compiler. On cmd/compile/internal/ssa, it's 5% of overall allocation, or 75MB in total. Enabling liveness maps everywhere significantly increased this allocation footprint, which in turn slowed down the compiler. Improve this by compacting the liveness maps after every block is processed. There are typically very few distinct liveness maps, so compacting the maps after every block, rather than at the end of the function, can significantly reduce these allocations. Passes toolstash -cmp. name old time/op new time/op delta Template 198ms ± 2% 196ms ± 1% -1.11% (p=0.008 n=9+10) Unicode 100ms ± 1% 99ms ± 1% -0.94% (p=0.015 n=8+9) GoTypes 703ms ± 2% 695ms ± 1% -1.15% (p=0.000 n=10+10) Compiler 3.38s ± 3% 3.33s ± 0% -1.66% (p=0.000 n=10+9) SSA 7.96s ± 1% 7.93s ± 1% ~ (p=0.113 n=9+10) Flate 134ms ± 1% 132ms ± 1% -1.30% (p=0.000 n=8+10) GoParser 165ms ± 2% 163ms ± 1% -1.32% (p=0.013 n=9+10) Reflect 462ms ± 2% 459ms ± 0% -0.65% (p=0.036 n=9+8) Tar 188ms ± 2% 186ms ± 1% ~ (p=0.173 n=8+10) XML 243ms ± 7% 239ms ± 1% ~ (p=0.684 n=10+10) [Geo mean] 421ms 416ms -1.10% name old alloc/op new alloc/op delta Template 38.0MB ± 0% 36.5MB ± 0% -3.98% (p=0.000 n=10+10) Unicode 30.3MB ± 0% 29.6MB ± 0% -2.21% (p=0.000 n=10+10) GoTypes 125MB ± 0% 120MB ± 0% -4.51% (p=0.000 n=10+9) Compiler 575MB ± 0% 546MB ± 0% -5.06% (p=0.000 n=10+10) SSA 1.64GB ± 0% 1.55GB ± 0% -4.97% (p=0.000 n=10+10) Flate 25.9MB ± 0% 25.0MB ± 0% -3.41% (p=0.000 n=10+10) GoParser 30.7MB ± 0% 29.5MB ± 0% -3.97% (p=0.000 n=10+10) Reflect 84.1MB ± 0% 81.9MB ± 0% -2.64% (p=0.000 n=10+10) Tar 37.0MB ± 0% 35.8MB ± 0% -3.27% (p=0.000 n=10+9) XML 47.2MB ± 0% 45.0MB ± 0% -4.57% (p=0.000 n=10+10) [Geo mean] 83.2MB 79.9MB -3.86% name old allocs/op new allocs/op delta Template 337k ± 0% 337k ± 0% -0.06% (p=0.000 n=10+10) Unicode 340k ± 0% 340k ± 0% -0.01% (p=0.014 n=10+10) GoTypes 1.18M ± 0% 1.18M ± 0% -0.04% (p=0.000 n=10+10) Compiler 4.97M ± 0% 4.97M ± 0% -0.03% (p=0.000 n=10+10) SSA 12.3M ± 0% 12.3M ± 0% -0.01% (p=0.000 n=10+10) Flate 226k ± 0% 225k ± 0% -0.09% (p=0.000 n=10+10) GoParser 283k ± 0% 283k ± 0% -0.06% (p=0.000 n=10+9) Reflect 972k ± 0% 971k ± 0% -0.04% (p=0.000 n=10+8) Tar 333k ± 0% 332k ± 0% -0.05% (p=0.000 n=10+9) XML 395k ± 0% 395k ± 0% -0.04% (p=0.000 n=10+10) [Geo mean] 764k 764k -0.04% Updates #24543. Change-Id: I6fdc46e4ddb6a8eea95d38242345205eb8397f0b Reviewed-on: https://go-review.googlesource.com/110177 Run-TryBot: Austin Clements TryBot-Result: Gobot Gobot Reviewed-by: Keith Randall --- src/cmd/compile/internal/gc/bv.go | 8 +- src/cmd/compile/internal/gc/plive.go | 105 +++++++++++++++------------ 2 files changed, 65 insertions(+), 48 deletions(-) diff --git a/src/cmd/compile/internal/gc/bv.go b/src/cmd/compile/internal/gc/bv.go index 7f5a432249..e9db35ede2 100644 --- a/src/cmd/compile/internal/gc/bv.go +++ b/src/cmd/compile/internal/gc/bv.go @@ -267,7 +267,7 @@ func (m *bvecSet) grow() { m.index = newIndex } -// add adds bv to the set and returns its index in m.uniq. +// add adds bv to the set and returns its index in m.extractUniqe. // The caller must not modify bv after this. func (m *bvecSet) add(bv bvec) int { if len(m.uniq)*4 >= len(m.index) { @@ -296,3 +296,9 @@ func (m *bvecSet) add(bv bvec) int { } } } + +// extractUniqe returns this slice of unique bit vectors in m, as +// indexed by the result of bvecSet.add. +func (m *bvecSet) extractUniqe() []bvec { + return m.uniq +} diff --git a/src/cmd/compile/internal/gc/plive.go b/src/cmd/compile/internal/gc/plive.go index f63530235f..4228e80c4d 100644 --- a/src/cmd/compile/internal/gc/plive.go +++ b/src/cmd/compile/internal/gc/plive.go @@ -117,8 +117,11 @@ type Liveness struct { // unsafePoints bit i is set if Value ID i is not a safe point. unsafePoints bvec - // An array with a bit vector for each safe point tracking live variables. - // Indexed sequentially by safe points in Block and Value order. + // An array with a bit vector for each safe point in the + // current Block during Liveness.epilogue. Indexed in Value + // order for that block. Additionally, for the entry block + // livevars[0] is the entry bitmap. Liveness.compact moves + // these to stackMaps and regMaps. livevars []varRegVec // livenessMap maps from safe points (i.e., CALLs) to their @@ -127,7 +130,9 @@ type Liveness struct { // TODO(austin): Now that we have liveness at almost every PC, // should this be a dense structure? livenessMap LivenessMap + stackMapSet bvecSet stackMaps []bvec + regMapSet map[liveRegMask]int regMaps []liveRegMask cache progeffectscache @@ -491,6 +496,9 @@ func newliveness(fn *Node, f *ssa.Func, vars []*Node, idx map[*Node]int32, stkpt idx: idx, stkptrsize: stkptrsize, be: make([]BlockEffects, f.NumBlocks()), + + livenessMap: LivenessMap{make(map[*ssa.Value]LivenessIndex)}, + regMapSet: make(map[liveRegMask]int), } nblocks := int32(len(f.Blocks)) @@ -975,6 +983,13 @@ func (lv *Liveness) epilogue() { } } + // We must analyze the entry block first. The runtime assumes + // the function entry map is index 0. Conveniently, layout + // already ensured that the entry block is first. + if lv.f.Entry != lv.f.Blocks[0] { + lv.f.Fatalf("entry block must be first") + } + { // Reserve an entry for function entry. live := bvalloc(nvars) @@ -1040,11 +1055,6 @@ func (lv *Liveness) epilogue() { // walk backward, construct maps at each safe point index := int32(len(lv.livevars) - 1) - if index < 0 { - // the first block we encounter should have the ATEXT so - // at no point should pos ever be less than zero. - Fatalf("livenessepilogue") - } liveout.Copy(be.liveout) for i := len(b.Values) - 1; i >= 0; i-- { @@ -1097,13 +1107,20 @@ func (lv *Liveness) epilogue() { index++ } } + + // The liveness maps for this block are now complete. Compact them. + lv.compact(b) } + // Done compacting. Throw out the stack map set. + lv.stackMaps = lv.stackMapSet.extractUniqe() + lv.stackMapSet = bvecSet{} + // Useful sanity check: on entry to the function, // the only things that can possibly be live are the // input parameters. for j, n := range lv.vars { - if n.Class() != PPARAM && lv.livevars[0].vars.Get(int32(j)) { + if n.Class() != PPARAM && lv.stackMaps[0].Get(int32(j)) { Fatalf("internal error: %v %L recorded as live on entry", lv.fn.Func.Nname, n) } } @@ -1111,7 +1128,7 @@ func (lv *Liveness) epilogue() { // The context register, if any, comes from a // LoweredGetClosurePtr operation first thing in the function, // so it doesn't appear live at entry. - if regs := lv.livevars[0].regs; regs != 0 { + if regs := lv.regMaps[0]; regs != 0 { lv.printDebug() lv.f.Fatalf("internal error: %v register %s recorded as live on entry", lv.fn.Func.Nname, regs.niceString(lv.f.Config)) } @@ -1292,8 +1309,10 @@ func (lv *Liveness) avarinitanyall(b *ssa.Block, any, all bvec) { } } -// Compact liveness information by coalescing identical per-call-site bitmaps. -// The merging only happens for a single function, not across the entire binary. +// Compact coalesces identical bitmaps from lv.livevars into the sets +// lv.stackMapSet and lv.regMaps. +// +// Compact clears lv.livevars. // // There are actually two lists of bitmaps, one list for the local variables and one // list for the function arguments. Both lists are indexed by the same PCDATA @@ -1306,47 +1325,34 @@ func (lv *Liveness) avarinitanyall(b *ssa.Block, any, all bvec) { // is actually a net loss: we save about 50k of argument bitmaps but the new // PCDATA tables cost about 100k. So for now we keep using a single index for // both bitmap lists. -func (lv *Liveness) compact() { - // Compact livevars. - // remap[i] = the index in lv.stackMaps of for bitmap lv.livevars[i]. - remap := make([]int, len(lv.livevars)) - set := newBvecSet(len(lv.livevars)) - for i, live := range lv.livevars { - remap[i] = set.add(live.vars) - } - lv.stackMaps = set.uniq - - // Compact register maps. - remapRegs := make([]int, len(lv.livevars)) - regMaps := make(map[liveRegMask]int) - for i, live := range lv.livevars { - idx, ok := regMaps[live.regs] +func (lv *Liveness) compact(b *ssa.Block) { + add := func(live varRegVec) LivenessIndex { + // Deduplicate the stack map. + stackIndex := lv.stackMapSet.add(live.vars) + // Deduplicate the register map. + regIndex, ok := lv.regMapSet[live.regs] if !ok { - idx = len(regMaps) - regMaps[live.regs] = idx + regIndex = len(lv.regMapSet) + lv.regMapSet[live.regs] = regIndex lv.regMaps = append(lv.regMaps, live.regs) } - remapRegs[i] = idx + return LivenessIndex{stackIndex, regIndex} } - - // Clear lv.livevars to allow GC of duplicate maps and to - // prevent accidental use. - lv.livevars = nil - - // Record compacted stack map indexes for each value. - // These will later become PCDATA instructions. - lv.showlive(nil, lv.stackMaps[0]) - pos := 1 - lv.livenessMap = LivenessMap{make(map[*ssa.Value]LivenessIndex)} - for _, b := range lv.f.Blocks { - for _, v := range b.Values { - if lv.issafepoint(v) { - lv.showlive(v, lv.stackMaps[remap[pos]]) - lv.livenessMap.m[v] = LivenessIndex{remap[pos], remapRegs[pos]} - pos++ - } + pos := 0 + if b == lv.f.Entry { + // Handle entry stack map. + add(lv.livevars[0]) + pos++ + } + for _, v := range b.Values { + if lv.issafepoint(v) { + lv.livenessMap.m[v] = add(lv.livevars[pos]) + pos++ } } + + // Reset livevars. + lv.livevars = lv.livevars[:0] } func (lv *Liveness) showlive(v *ssa.Value, live bvec) { @@ -1647,8 +1653,13 @@ func liveness(e *ssafn, f *ssa.Func) LivenessMap { lv.prologue() lv.solve() lv.epilogue() - lv.compact() lv.clobber() + if debuglive > 0 { + lv.showlive(nil, lv.stackMaps[0]) + for val, idx := range lv.livenessMap.m { + lv.showlive(val, lv.stackMaps[idx.stackMapIndex]) + } + } if debuglive >= 2 { lv.printDebug() } -- 2.48.1