From 39eea62340a129154d3c2c2347386e8af762d6d1 Mon Sep 17 00:00:00 2001 From: Heschi Kreinick Date: Mon, 22 Jan 2018 17:06:26 -0500 Subject: [PATCH] cmd/compile/internal/ssa: reduce location list memory use MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Put everything that showed up in the allocation profile into the cache, and reuse it across functions. After this CL, the overhead of enabling location lists is getting pretty close to the desired 5%: compilecmp -all -beforeflags -dwarflocationlists=0 -afterflags -dwarflocationlists=1 -n 30 4ebad42292b6a4090faf37753dd768d2965e38c4 4ebad42292b6a4090faf37753dd768d2965e38c4 compilecmp -dwarflocationlists=0 4ebad42292b6a4090faf37753dd768d2965e38c4 -dwarflocationlists=1 4ebad42292b6a4090faf37753dd768d2965e38c4 benchstat -geomean /tmp/869550129 /tmp/143495132 completed 30 of 30, estimated time remaining 0s (eta 3:24PM) name old time/op new time/op delta Template 199ms ± 4% 209ms ± 6% +5.17% (p=0.000 n=29+30) Unicode 99.2ms ± 8% 100.5ms ± 6% ~ (p=0.112 n=30+30) GoTypes 642ms ± 3% 684ms ± 3% +6.54% (p=0.000 n=29+30) SSA 8.00s ± 1% 8.71s ± 1% +8.78% (p=0.000 n=29+29) Flate 129ms ± 7% 134ms ± 5% +3.77% (p=0.000 n=30+30) GoParser 157ms ± 4% 164ms ± 5% +4.35% (p=0.000 n=29+30) Reflect 428ms ± 3% 450ms ± 4% +5.09% (p=0.000 n=30+30) Tar 195ms ± 5% 204ms ± 8% +4.78% (p=0.000 n=30+30) XML 228ms ± 4% 241ms ± 4% +5.62% (p=0.000 n=30+29) StdCmd 15.4s ± 1% 16.7s ± 1% +8.29% (p=0.000 n=29+29) [Geo mean] 476ms 502ms +5.35% name old user-time/op new user-time/op delta Template 294ms ±18% 304ms ±15% ~ (p=0.242 n=29+29) Unicode 182ms ±27% 172ms ±28% ~ (p=0.104 n=30+30) GoTypes 957ms ±15% 1016ms ±12% +6.16% (p=0.000 n=30+30) SSA 13.3s ± 5% 14.3s ± 3% +7.32% (p=0.000 n=30+28) Flate 188ms ±17% 193ms ±17% ~ (p=0.288 n=28+29) GoParser 232ms ±16% 238ms ±13% ~ (p=0.065 n=30+29) Reflect 585ms ±13% 620ms ±10% +5.88% (p=0.000 n=30+30) Tar 298ms ±21% 332ms ±23% +11.32% (p=0.000 n=30+30) XML 329ms ±17% 343ms ±12% +4.18% (p=0.032 n=30+30) [Geo mean] 492ms 513ms +4.13% name old alloc/op new alloc/op delta Template 38.3MB ± 0% 40.3MB ± 0% +5.29% (p=0.000 n=30+30) Unicode 29.3MB ± 0% 29.6MB ± 0% +1.28% (p=0.000 n=30+29) GoTypes 110MB ± 0% 118MB ± 0% +6.97% (p=0.000 n=29+30) SSA 1.48GB ± 0% 1.61GB ± 0% +9.06% (p=0.000 n=30+30) Flate 24.8MB ± 0% 26.0MB ± 0% +4.99% (p=0.000 n=29+30) GoParser 30.9MB ± 0% 32.2MB ± 0% +4.20% (p=0.000 n=30+30) Reflect 76.8MB ± 0% 80.6MB ± 0% +4.97% (p=0.000 n=30+30) Tar 39.6MB ± 0% 41.7MB ± 0% +5.22% (p=0.000 n=29+30) XML 42.0MB ± 0% 45.4MB ± 0% +8.22% (p=0.000 n=29+30) [Geo mean] 63.9MB 67.5MB +5.56% name old allocs/op new allocs/op delta Template 383k ± 0% 405k ± 0% +5.69% (p=0.000 n=30+30) Unicode 343k ± 0% 346k ± 0% +0.98% (p=0.000 n=30+27) GoTypes 1.15M ± 0% 1.22M ± 0% +6.17% (p=0.000 n=29+29) SSA 12.2M ± 0% 13.2M ± 0% +8.15% (p=0.000 n=30+30) Flate 234k ± 0% 249k ± 0% +6.44% (p=0.000 n=30+30) GoParser 315k ± 0% 332k ± 0% +5.31% (p=0.000 n=30+28) Reflect 972k ± 0% 1010k ± 0% +3.89% (p=0.000 n=30+30) Tar 394k ± 0% 415k ± 0% +5.35% (p=0.000 n=28+30) XML 404k ± 0% 429k ± 0% +6.31% (p=0.000 n=29+29) [Geo mean] 651k 686k +5.35% Change-Id: Ia005a8d6b33ce9f8091322f004376a3d6e5c1a94 Reviewed-on: https://go-review.googlesource.com/89357 Run-TryBot: Heschi Kreinick TryBot-Result: Gobot Gobot Reviewed-by: David Chase --- src/cmd/compile/internal/ssa/cache.go | 23 +++ src/cmd/compile/internal/ssa/debug.go | 236 ++++++++++++++++++-------- 2 files changed, 190 insertions(+), 69 deletions(-) diff --git a/src/cmd/compile/internal/ssa/cache.go b/src/cmd/compile/internal/ssa/cache.go index f1018da497..caaac0b387 100644 --- a/src/cmd/compile/internal/ssa/cache.go +++ b/src/cmd/compile/internal/ssa/cache.go @@ -20,6 +20,16 @@ type Cache struct { domblockstore []ID // scratch space for computing dominators scrSparse []*sparseSet // scratch sparse sets to be re-used. + + blockDebug []BlockDebug + valueNames [][]SlotID + slotLocs []VarLoc + regContents [][]SlotID + pendingEntries []pendingEntry + pendingSlotLocs []VarLoc + + liveSlotSliceBegin int + liveSlots []liveSlot } func (c *Cache) Reset() { @@ -38,4 +48,17 @@ func (c *Cache) Reset() { for i := range xl { xl[i] = nil } + + c.liveSlots = c.liveSlots[:0] + c.liveSlotSliceBegin = 0 +} + +func (c *Cache) AppendLiveSlot(ls liveSlot) { + c.liveSlots = append(c.liveSlots, ls) +} + +func (c *Cache) GetLiveSlotSlice() []liveSlot { + s := c.liveSlots[c.liveSlotSliceBegin:] + c.liveSlotSliceBegin = len(c.liveSlots) + return s } diff --git a/src/cmd/compile/internal/ssa/debug.go b/src/cmd/compile/internal/ssa/debug.go index 8ec146287c..048ff7e230 100644 --- a/src/cmd/compile/internal/ssa/debug.go +++ b/src/cmd/compile/internal/ssa/debug.go @@ -58,20 +58,25 @@ type stateAtPC struct { // reset fills state with the live variables from live. func (state *stateAtPC) reset(live []liveSlot) { - for i := range state.slots { - state.slots[i] = VarLoc{} + slots, registers := state.slots, state.registers + for i := range slots { + slots[i] = VarLoc{} } - for i := range state.registers { - state.registers[i] = state.registers[i][:0] + for i := range registers { + registers[i] = registers[i][:0] } for _, live := range live { - state.slots[live.slot] = live.loc - for reg, regMask := 0, 1; reg < len(state.registers); reg, regMask = reg+1, regMask<<1 { + slots[live.slot] = live.loc + if live.loc.Registers == 0 { + continue + } + for reg, regMask := 0, 1; reg < len(registers); reg, regMask = reg+1, regMask<<1 { if live.loc.Registers&RegisterSet(regMask) != 0 { - state.registers[reg] = append(state.registers[reg], SlotID(live.slot)) + registers[reg] = append(registers[reg], SlotID(live.slot)) } } } + state.slots, state.registers = slots, registers } func (b *BlockDebug) LocString(loc VarLoc) string { @@ -162,9 +167,75 @@ type debugState struct { // The current state of whatever analysis is running. currentState stateAtPC + liveCount []int changedVars []bool } +func (state *debugState) initializeCache() { + numBlocks := state.f.NumBlocks() + + // One blockDebug per block. Initialized in allocBlock. + if cap(state.cache.blockDebug) < numBlocks { + state.cache.blockDebug = make([]BlockDebug, numBlocks) + } + // This local variable, and the ones like it below, enable compiler + // optimizations. Don't inline them. + b := state.cache.blockDebug[:numBlocks] + for i := range b { + b[i] = BlockDebug{} + } + + // A list of slots per Value. Reuse the previous child slices. + if cap(state.cache.valueNames) < state.f.NumValues() { + old := state.cache.valueNames + state.cache.valueNames = make([][]SlotID, state.f.NumValues()) + copy(state.cache.valueNames, old) + } + state.valueNames = state.cache.valueNames + vn := state.valueNames[:state.f.NumValues()] + for i := range vn { + vn[i] = vn[i][:0] + } + + // Slot and register contents for currentState. Cleared by reset(). + if cap(state.cache.slotLocs) < len(state.slots) { + state.cache.slotLocs = make([]VarLoc, len(state.slots)) + } + state.currentState.slots = state.cache.slotLocs[:len(state.slots)] + if cap(state.cache.regContents) < len(state.registers) { + state.cache.regContents = make([][]SlotID, len(state.registers)) + } + state.currentState.registers = state.cache.regContents[:len(state.registers)] + + // Used many times by mergePredecessors. + state.liveCount = make([]int, len(state.slots)) + + // A relatively small slice, but used many times as the return from processValue. + state.changedVars = make([]bool, len(state.vars)) + + // A pending entry per user variable, with space to track each of its pieces. + if want := len(state.vars) * len(state.slots); cap(state.cache.pendingSlotLocs) < want { + state.cache.pendingSlotLocs = make([]VarLoc, want) + } + psl := state.cache.pendingSlotLocs[:len(state.vars)*len(state.slots)] + for i := range psl { + psl[i] = VarLoc{} + } + if cap(state.cache.pendingEntries) < len(state.vars) { + state.cache.pendingEntries = make([]pendingEntry, len(state.vars)) + } + pe := state.cache.pendingEntries[:len(state.vars)] + for varID := range pe { + pe[varID] = pendingEntry{ + pieces: state.cache.pendingSlotLocs[varID*len(state.slots) : (varID+1)*len(state.slots)], + } + } +} + +func (state *debugState) allocBlock(b *Block) *BlockDebug { + return &state.cache.blockDebug[b.ID] +} + func (s *debugState) blockEndStateString(b *BlockDebug) string { endState := stateAtPC{slots: make([]VarLoc, len(s.slots)), registers: make([][]SlotID, len(s.slots))} endState.reset(b.endState) @@ -207,14 +278,11 @@ func BuildFuncDebug(ctxt *obj.Link, f *Func, loggingEnabled bool, stackOffset fu loggingEnabled: loggingEnabled, slots: make([]*LocalSlot, len(f.Names)), - f: f, - cache: f.Cache, - registers: f.Config.registers, - stackOffset: stackOffset, - currentState: stateAtPC{make([]VarLoc, len(f.Names)), make([][]SlotID, len(f.Config.registers))}, + f: f, + cache: f.Cache, + registers: f.Config.registers, + stackOffset: stackOffset, } - // TODO: consider storing this in Cache and reusing across functions. - state.valueNames = make([][]SlotID, f.NumValues()) // Recompose any decomposed variables, and record the names associated with each value. varParts := map[GCNode][]SlotID{} @@ -224,9 +292,6 @@ func BuildFuncDebug(ctxt *obj.Link, f *Func, loggingEnabled bool, stackOffset fu if isSynthetic(&slot) { continue } - for _, value := range f.NamedValues[slot] { - state.valueNames[value.ID] = append(state.valueNames[value.ID], SlotID(i)) - } topSlot := &slot for topSlot.SplitOf != nil { @@ -247,8 +312,18 @@ func BuildFuncDebug(ctxt *obj.Link, f *Func, loggingEnabled bool, stackOffset fu for _, slotID := range parts { state.slotVars[slotID] = VarID(varID) } + sort.Sort(partsByVarOffset{parts, state.slots}) + } + + state.initializeCache() + for i, slot := range f.Names { + if isSynthetic(&slot) { + continue + } + for _, value := range f.NamedValues[slot] { + state.valueNames[value.ID] = append(state.valueNames[value.ID], SlotID(i)) + } } - state.changedVars = make([]bool, len(state.vars)) blockLocs := state.liveness() lists := state.buildLocationLists(ctxt, stackOffset, blockLocs) @@ -282,7 +357,7 @@ func (state *debugState) liveness() []*BlockDebug { // Build the starting state for the block from the final // state of its predecessors. locs := state.mergePredecessors(b, blockLocs) - + changed := false if state.loggingEnabled { state.logf("Processing %v, initial state:\n%v", b, state.stateString(locs, state.currentState)) } @@ -314,20 +389,25 @@ func (state *debugState) liveness() []*BlockDebug { } reg, _ := state.f.getHome(v.ID).(*Register) - state.processValue(v, slots, reg) + c := state.processValue(v, slots, reg) + changed = changed || c } if state.loggingEnabled { state.f.Logf("Block %v done, locs:\n%v", b, state.stateString(locs, state.currentState)) } - for slotID, slotLoc := range state.currentState.slots { - if slotLoc.absent() { - continue + if !changed { + locs.endState = locs.startState + } else { + for slotID, slotLoc := range state.currentState.slots { + if slotLoc.absent() { + continue + } + state.cache.AppendLiveSlot(liveSlot{SlotID(slotID), slotLoc}) } - locs.endState = append(locs.endState, liveSlot{SlotID(slotID), slotLoc}) + locs.endState = state.cache.GetLiveSlotSlice() } - blockLocs[b.ID] = locs } return blockLocs @@ -337,7 +417,7 @@ func (state *debugState) liveness() []*BlockDebug { // intersects them to form the starting state for b. It returns that state in // the BlockDebug, and fills state.currentState with it. func (state *debugState) mergePredecessors(b *Block, blockLocs []*BlockDebug) *BlockDebug { - result := &BlockDebug{} + result := state.allocBlock(b) if state.loggingEnabled { result.Block = b } @@ -361,10 +441,10 @@ func (state *debugState) mergePredecessors(b *Block, blockLocs []*BlockDebug) *B return result } + p0 := blockLocs[preds[0].ID].endState if len(preds) == 1 { - p := blockLocs[preds[0].ID] - result.startState = p.endState - state.currentState.reset(p.endState) + result.startState = p0 + state.currentState.reset(p0) return result } @@ -372,18 +452,17 @@ func (state *debugState) mergePredecessors(b *Block, blockLocs []*BlockDebug) *B state.logf("Starting %v with state from %v:\n%v", b, preds[0], state.blockEndStateString(blockLocs[preds[0].ID])) } - count := make([]int, len(state.slots)) slotLocs := state.currentState.slots - for _, predSlot := range blockLocs[preds[0].ID].endState { + for _, predSlot := range p0 { slotLocs[predSlot.slot] = predSlot.loc - count[predSlot.slot] = 1 + state.liveCount[predSlot.slot] = 1 } for i := 1; i < len(preds); i++ { if state.loggingEnabled { state.logf("Merging in state from %v:\n%v", preds[i], state.blockEndStateString(blockLocs[preds[i].ID])) } for _, predSlot := range blockLocs[preds[i].ID].endState { - count[predSlot.slot]++ + state.liveCount[predSlot.slot]++ liveLoc := slotLocs[predSlot.slot] if !liveLoc.OnStack || !predSlot.loc.OnStack || liveLoc.StackOffset != predSlot.loc.StackOffset { liveLoc.OnStack = false @@ -394,6 +473,25 @@ func (state *debugState) mergePredecessors(b *Block, blockLocs []*BlockDebug) *B } } + // Check if the final state is the same as the first predecessor's + // final state, and reuse it if so. In principle it could match any, + // but it's probably not worth checking more than the first. + unchanged := true + for _, predSlot := range p0 { + if state.liveCount[predSlot.slot] != len(preds) || slotLocs[predSlot.slot] != predSlot.loc { + unchanged = false + break + } + } + if unchanged { + if state.loggingEnabled { + state.logf("After merge, %v matches %v exactly.\n", b, preds[0]) + } + result.startState = p0 + state.currentState.reset(p0) + return result + } + for reg := range state.currentState.registers { state.currentState.registers[reg] = state.currentState.registers[reg][:0] } @@ -406,12 +504,12 @@ func (state *debugState) mergePredecessors(b *Block, blockLocs []*BlockDebug) *B continue } // Seen in only some predecessors. Clear it out. - if count[slotID] != len(preds) { + if state.liveCount[slotID] != len(preds) { slotLocs[slotID] = VarLoc{} continue } // Present in all predecessors. - result.startState = append(result.startState, liveSlot{SlotID(slotID), slotLoc}) + state.cache.AppendLiveSlot(liveSlot{SlotID(slotID), slotLoc}) if slotLoc.Registers == 0 { continue } @@ -421,6 +519,7 @@ func (state *debugState) mergePredecessors(b *Block, blockLocs []*BlockDebug) *B } } } + result.startState = state.cache.GetLiveSlotSlice() return result } @@ -428,9 +527,11 @@ func (state *debugState) mergePredecessors(b *Block, blockLocs []*BlockDebug) *B // the names in vSlots and homed in vReg. "v" becomes visible after execution of // the instructions evaluating it. It returns which VarIDs were modified by the // Value's execution. -func (state *debugState) processValue(v *Value, vSlots []SlotID, vReg *Register) { +func (state *debugState) processValue(v *Value, vSlots []SlotID, vReg *Register) bool { locs := state.currentState + changed := false setSlot := func(slot SlotID, loc VarLoc) { + changed = true state.changedVars[state.slotVars[slot]] = true state.currentState.slots[slot] = loc } @@ -527,6 +628,7 @@ func (state *debugState) processValue(v *Value, vSlots []SlotID, vReg *Register) setSlot(slot, loc) } } + return changed } // varOffset returns the offset of slot within the user variable it was @@ -539,17 +641,16 @@ func varOffset(slot *LocalSlot) int64 { return offset } -// This type is deleted in a subsequent CL. -type varPart struct { - varOffset int64 - slot SlotID +type partsByVarOffset struct { + slotIDs []SlotID + slots []*LocalSlot } -type partsByVarOffset []varPart - -func (a partsByVarOffset) Len() int { return len(a) } -func (a partsByVarOffset) Less(i, j int) bool { return a[i].varOffset < a[j].varOffset } -func (a partsByVarOffset) Swap(i, j int) { a[i], a[j] = a[j], a[i] } +func (a partsByVarOffset) Len() int { return len(a.slotIDs) } +func (a partsByVarOffset) Less(i, j int) bool { + return varOffset(a.slots[a.slotIDs[i]]) < varOffset(a.slots[a.slotIDs[i]]) +} +func (a partsByVarOffset) Swap(i, j int) { a.slotIDs[i], a.slotIDs[j] = a.slotIDs[j], a.slotIDs[i] } // A pendingEntry represents the beginning of a location list entry, missing // only its end coordinate. @@ -608,19 +709,7 @@ func firstReg(set RegisterSet) uint8 { // be finished by PutLocationList. func (state *debugState) buildLocationLists(Ctxt *obj.Link, stackOffset func(*LocalSlot) int32, blockLocs []*BlockDebug) [][]byte { lists := make([][]byte, len(state.vars)) - varParts := make([][]varPart, len(lists)) - pendingEntries := make([]pendingEntry, len(lists)) - - for varID, parts := range state.varSlots { - for _, slotID := range parts { - varParts[varID] = append(varParts[varID], varPart{varOffset(state.slots[slotID]), slotID}) - } - // Get the order the parts need to be in to represent the memory - // of the decomposed user variable. - sort.Sort(partsByVarOffset(varParts[varID])) - - pendingEntries[varID].pieces = make([]VarLoc, len(state.slots)) - } + pendingEntries := state.cache.pendingEntries // writePendingEntry writes out the pending entry for varID, if any, // terminated at endBlock/Value. @@ -649,15 +738,15 @@ func (state *debugState) buildLocationLists(Ctxt *obj.Link, stackOffset func(*Lo if state.loggingEnabled { var partStrs []string - for _, part := range varParts[varID] { - partStrs = append(partStrs, fmt.Sprintf("%v@%v", state.slots[part.slot], blockLocs[endBlock].LocString(pending.pieces[part.slot]))) + for _, slot := range state.varSlots[varID] { + partStrs = append(partStrs, fmt.Sprintf("%v@%v", state.slots[slot], blockLocs[endBlock].LocString(pending.pieces[slot]))) } state.logf("Add entry for %v: \tb%vv%v-b%vv%v = \t%v\n", state.vars[varID], pending.startBlock, pending.startValue, endBlock, endValue, strings.Join(partStrs, " ")) } - for _, part := range varParts[varID] { - loc := pending.pieces[part.slot] - slot := state.slots[part.slot] + for _, slotID := range state.varSlots[varID] { + loc := pending.pieces[slotID] + slot := state.slots[slotID] if !loc.absent() { if loc.OnStack { @@ -678,7 +767,7 @@ func (state *debugState) buildLocationLists(Ctxt *obj.Link, stackOffset func(*Lo } } - if len(varParts[varID]) > 1 { + if len(state.varSlots[varID]) > 1 { list = append(list, dwarf.DW_OP_piece) list = dwarf.AppendUleb128(list, uint64(slot.Type.Size())) } @@ -692,8 +781,8 @@ func (state *debugState) buildLocationLists(Ctxt *obj.Link, stackOffset func(*Lo updateVar := func(varID VarID, v *Value, curLoc []VarLoc) { // Assemble the location list entry with whatever's live. empty := true - for _, part := range varParts[varID] { - if !curLoc[part.slot].absent() { + for _, slotID := range state.varSlots[varID] { + if !curLoc[slotID].absent() { empty = false break } @@ -708,8 +797,8 @@ func (state *debugState) buildLocationLists(Ctxt *obj.Link, stackOffset func(*Lo // Extend the previous entry if possible. if pending.present { merge := true - for _, part := range varParts[varID] { - if !canMerge(pending.pieces[part.slot], curLoc[part.slot]) { + for _, slotID := range state.varSlots[varID] { + if !canMerge(pending.pieces[slotID], curLoc[slotID]) { merge = false break } @@ -733,15 +822,24 @@ func (state *debugState) buildLocationLists(Ctxt *obj.Link, stackOffset func(*Lo for _, b := range state.f.Blocks { state.currentState.reset(blockLocs[b.ID].startState) + phisPending := false for _, v := range b.Values { slots := state.valueNames[v.ID] reg, _ := state.f.getHome(v.ID).(*Register) - state.processValue(v, slots, reg) + changed := state.processValue(v, slots, reg) if v.Op == OpPhi { + if changed { + phisPending = true + } + continue + } + + if !changed && !phisPending { continue } + phisPending = false for varID := range state.changedVars { if !state.changedVars[varID] { continue -- 2.50.0