From ad8447bed94ccb89338b05e7e38f7d53874f0340 Mon Sep 17 00:00:00 2001
From: David Chase
Date: Mon, 27 Jul 2020 16:46:35 -0400
Subject: [PATCH] cmd/compile: fix late call expansion for SSA-able aggregate
 results and arguments

This change incorporates the decision that it should be possible to run
call expansion relatively late in the optimization chain, so that
(1) calls themselves can be exposed to useful optimizations, and
(2) the effect of selectors on aggregates is seen at the rewrite, so
that assignment of parts into registers is less complicated (at least
I hope it works that way).

That means that selectors feeding into SelectN need to be processed,
and Make* feeding into call parameters need to be processed.

This does, however, require that call expansion run before decompose
builtins.

This doesn't yet handle rewrites of strings, slices, interfaces, and
complex numbers.

Passes run.bash and race.bash

Change-Id: I71ff23d3c491043beb30e926949970c4f63ef1a4
Reviewed-on: https://go-review.googlesource.com/c/go/+/245133
Trust: David Chase
Run-TryBot: David Chase
TryBot-Result: Go Bot
Reviewed-by: Cherry Zhang
---
 src/cmd/compile/internal/gc/ssa.go           |  11 +-
 src/cmd/compile/internal/ssa/compile.go      |   2 +-
 src/cmd/compile/internal/ssa/config.go       |   8 +
 src/cmd/compile/internal/ssa/expand_calls.go | 392 ++++++++++++++++---
 src/cmd/compile/internal/ssa/writebarrier.go |   6 +-
 5 files changed, 360 insertions(+), 59 deletions(-)

diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go
index 815ff7f99f..aebb40568c 100644
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -10,7 +10,6 @@ import (
 	"html"
 	"os"
 	"sort"
-	"strings"
 
 	"bufio"
 	"bytes"
@@ -2560,19 +2559,19 @@ func (s *state) expr(n *Node) *ssa.Value {
 		if s.prevCall == nil || s.prevCall.Op != ssa.OpStaticLECall {
 			// Do the old thing
 			addr := s.constOffPtrSP(types.NewPtr(n.Type), n.Xoffset)
-			return s.load(n.Type, addr)
+			return s.rawLoad(n.Type, addr)
 		}
 		which := s.prevCall.Aux.(*ssa.AuxCall).ResultForOffset(n.Xoffset)
 		if which == -1 {
 			// Do the old thing // TODO: Panic instead.
 			addr := s.constOffPtrSP(types.NewPtr(n.Type), n.Xoffset)
-			return s.load(n.Type, addr)
+			return s.rawLoad(n.Type, addr)
 		}
 		if canSSAType(n.Type) {
 			return s.newValue1I(ssa.OpSelectN, n.Type, which, s.prevCall)
 		} else {
 			addr := s.newValue1I(ssa.OpSelectNAddr, types.NewPtr(n.Type), which, s.prevCall)
-			return s.load(n.Type, addr)
+			return s.rawLoad(n.Type, addr)
 		}
 
 	case ODEREF:
@@ -4377,7 +4376,7 @@ func (s *state) call(n *Node, k callKind, returnResultAddr bool) *ssa.Value {
 	case OCALLFUNC:
 		if k == callNormal && fn.Op == ONAME && fn.Class() == PFUNC {
 			sym = fn.Sym
-			if !returnResultAddr && strings.Contains(sym.Name, "testLateExpansion") {
+			if !returnResultAddr && ssa.LateCallExpansionEnabledWithin(s.f) {
 				testLateExpansion = true
 			}
 			break
@@ -4394,7 +4393,7 @@ func (s *state) call(n *Node, k callKind, returnResultAddr bool) *ssa.Value {
 		}
 		if k == callNormal {
 			sym = fn.Sym
-			if !returnResultAddr && strings.Contains(sym.Name, "testLateExpansion") {
+			if !returnResultAddr && ssa.LateCallExpansionEnabledWithin(s.f) {
 				testLateExpansion = true
 			}
 			break
diff --git a/src/cmd/compile/internal/ssa/compile.go b/src/cmd/compile/internal/ssa/compile.go
index 4eed612977..3dec1cd85b 100644
--- a/src/cmd/compile/internal/ssa/compile.go
+++ b/src/cmd/compile/internal/ssa/compile.go
@@ -431,9 +431,9 @@ var passes = [...]pass{
 	{name: "nilcheckelim", fn: nilcheckelim},
 	{name: "prove", fn: prove},
 	{name: "early fuse", fn: fuseEarly},
+	{name: "expand calls", fn: expandCalls, required: true},
 	{name: "decompose builtin", fn: decomposeBuiltIn, required: true},
 	{name: "softfloat", fn: softfloat, required: true},
-	{name: "expand calls", fn:expandCalls, required: true},
 	{name: "late opt", fn: opt, required: true}, // TODO: split required rules and optimizing rules
 	{name: "dead auto elim", fn: elimDeadAutosGeneric},
 	{name: "generic deadcode", fn: deadcode, required: true}, // remove dead stores, which otherwise mess up store chain
diff --git a/src/cmd/compile/internal/ssa/config.go b/src/cmd/compile/internal/ssa/config.go
index 7f01f8047f..a73bcf8fca 100644
--- a/src/cmd/compile/internal/ssa/config.go
+++ b/src/cmd/compile/internal/ssa/config.go
@@ -196,6 +196,14 @@ const (
 	ClassParamOut // return value
 )
 
+const go116lateCallExpansion = true
+
+// LateCallExpansionEnabledWithin returns true if late call expansion should be tested
+// within compilation of a function/method triggered by GOSSAHASH (defaults to "yes").
+func LateCallExpansionEnabledWithin(f *Func) bool {
+	return go116lateCallExpansion && f.DebugTest // Currently set up for GOSSAHASH bug searches
+}
+
 // NewConfig returns a new configuration object for the given architecture.
 func NewConfig(arch string, types Types, ctxt *obj.Link, optimize bool) *Config {
 	c := &Config{arch: arch, Types: types}
diff --git a/src/cmd/compile/internal/ssa/expand_calls.go b/src/cmd/compile/internal/ssa/expand_calls.go
index 34cff51c00..8456dbab8d 100644
--- a/src/cmd/compile/internal/ssa/expand_calls.go
+++ b/src/cmd/compile/internal/ssa/expand_calls.go
@@ -4,14 +4,59 @@
 
 package ssa
 
-import "cmd/compile/internal/types"
+import (
+	"cmd/compile/internal/types"
+	"cmd/internal/src"
+	"fmt"
+	"sort"
+)
 
 // expandCalls converts LE (Late Expansion) calls that act like they receive value args into a lower-level form
-// that is more oriented to a platform's ABI. The SelectN operations that extract results are also rewritten into
-// more appropriate forms.
+// that is more oriented to a platform's ABI. The SelectN operations that extract results are rewritten into
+// more appropriate forms, and any StructMake or ArrayMake inputs are decomposed until non-struct values are
+// reached (for now, Strings, Slices, Complex, and Interface are not decomposed because they are rewritten in
+// a subsequent phase, but that may need to change for a register ABI in case one of those composite values is
+// split between registers and memory).
+//
+// TODO: when it comes time to use registers, might want to include builtin selectors as well, but currently that happens in lower.
 func expandCalls(f *Func) {
+	if !LateCallExpansionEnabledWithin(f) {
+		return
+	}
 	canSSAType := f.fe.CanSSA
+	regSize := f.Config.RegSize
 	sp, _ := f.spSb()
+
+	debug := f.pass.debug > 0
+
+	// For 32-bit, need to deal with decomposition of 64-bit integers
+	tUint32 := types.Types[types.TUINT32]
+	tInt32 := types.Types[types.TINT32]
+	var hiOffset, lowOffset int64
+	if f.Config.BigEndian {
+		lowOffset = 4
+	} else {
+		hiOffset = 4
+	}
+	pairTypes := func(et types.EType) (tHi, tLo *types.Type) {
+		tHi = tUint32
+		if et == types.TINT64 {
+			tHi = tInt32
+		}
+		tLo = tUint32
+		return
+	}
+
+	// isAlreadyExpandedAggregateType returns whether a type is an SSA-able "aggregate" (multiple register) type
+	// that was expanded in an earlier phase (small user-defined arrays and structs, lowered in decomposeUser).
+	// Other aggregate types are expanded in decomposeBuiltin, which comes later.
+	isAlreadyExpandedAggregateType := func(t *types.Type) bool {
+		if !canSSAType(t) {
+			return false
+		}
+		return t.IsStruct() || t.IsArray() || regSize == 4 && t.Size() > 4 && t.IsInteger()
+	}
+
 	// Calls that need lowering have some number of inputs, including a memory input,
 	// and produce a tuple of (value1, value2, ..., mem) where valueK may or may not be SSA-able.
@@ -21,80 +66,329 @@ func expandCalls(f *Func) {
 
 	// With the current ABI, the outputs need to be converted to loads, which will all use the call's
 	// memory output as their input.
-	// Step 1: find all references to calls as values and rewrite those.
-	for _, b := range f.Blocks {
-		for _, v := range b.Values {
-			switch v.Op {
-			case OpSelectN:
-				call := v.Args[0]
-				aux := call.Aux.(*AuxCall)
-				which := v.AuxInt
-				t := v.Type
-				if which == aux.NResults() { // mem is after the results.
-					// rewrite v as a Copy of call -- the replacement call will produce a mem.
-					v.copyOf(call)
-				} else {
-					pt := types.NewPtr(t)
-					if canSSAType(t) {
-						off := f.ConstOffPtrSP(pt, aux.OffsetOfResult(which), sp)
-						v.reset(OpLoad)
-						v.SetArgs2(off, call)
+	// rewriteSelect recursively walks leaf selector to a root (OpSelectN) through
+	// a chain of Struct/Array Select operations. If the chain of selectors does not
+	// end in OpSelectN, it does nothing (this can happen depending on compiler phase ordering).
+	// It emits the code necessary to implement the leaf select operation that leads to the call.
+	// TODO when registers really arrive, must also decompose anything split across two registers or registers and memory.
+	var rewriteSelect func(leaf *Value, selector *Value, offset int64)
+	rewriteSelect = func(leaf *Value, selector *Value, offset int64) {
+		switch selector.Op {
+		case OpSelectN:
+			// TODO these may be duplicated. Should memoize. Intermediate selectors will go dead, no worries there.
+			call := selector.Args[0]
+			aux := call.Aux.(*AuxCall)
+			which := selector.AuxInt
+			if which == aux.NResults() { // mem is after the results.
+				// rewrite v as a Copy of call -- the replacement call will produce a mem.
+				leaf.copyOf(call)
+			} else {
+				leafType := leaf.Type
+				pt := types.NewPtr(leafType)
+				if canSSAType(leafType) {
+					off := f.ConstOffPtrSP(pt, offset+aux.OffsetOfResult(which), sp)
+					// Any selection right out of the arg area/registers has to be same Block as call, use call as mem input.
+					if leaf.Block == call.Block {
+						leaf.reset(OpLoad)
+						leaf.SetArgs2(off, call)
 					} else {
-						panic("Should not have non-SSA-able OpSelectN")
+						w := call.Block.NewValue2(leaf.Pos, OpLoad, leafType, off, call)
+						leaf.copyOf(w)
 					}
+				} else {
+					panic("Should not have non-SSA-able OpSelectN")
 				}
-				v.Type = t // not right for the mem operand yet, but will be when call is rewritten.
+			}
+		case OpStructSelect:
+			w := selector.Args[0]
+			if w.Type.Etype != types.TSTRUCT {
+				fmt.Printf("Bad type for w:\nv=%v\nsel=%v\nw=%v\n,f=%s\n", leaf.LongString(), selector.LongString(), w.LongString(), f.Name)
+			}
+			rewriteSelect(leaf, w, offset+w.Type.FieldOff(int(selector.AuxInt)))
 
-			case OpSelectNAddr:
-				call := v.Args[0]
-				which := v.AuxInt
-				aux := call.Aux.(*AuxCall)
-				pt := v.Type
-				off := f.ConstOffPtrSP(pt, aux.OffsetOfResult(which), sp)
-				v.copyOf(off)
+		case OpInt64Hi:
+			w := selector.Args[0]
+			rewriteSelect(leaf, w, offset+hiOffset)
+
+		case OpInt64Lo:
+			w := selector.Args[0]
+			rewriteSelect(leaf, w, offset+lowOffset)
+
+		case OpArraySelect:
+			w := selector.Args[0]
+			rewriteSelect(leaf, w, offset+selector.Type.Size()*selector.AuxInt)
+		default:
+			// Ignore dead ends; on 32-bit, these can occur running before decompose builtins.
+		}
+	}
+
+	// storeArg converts stores of SSA-able aggregates into a series of stores of smaller types into
+	// individual parameter slots.
+	// TODO when registers really arrive, must also decompose anything split across two registers or registers and memory.
+	var storeArg func(pos src.XPos, b *Block, a *Value, t *types.Type, offset int64, mem *Value) *Value
+	storeArg = func(pos src.XPos, b *Block, a *Value, t *types.Type, offset int64, mem *Value) *Value {
+		switch a.Op {
+		case OpArrayMake0, OpStructMake0:
+			return mem
+		case OpStructMake1, OpStructMake2, OpStructMake3, OpStructMake4:
+			for i := 0; i < t.NumFields(); i++ {
+				fld := t.Field(i)
+				mem = storeArg(pos, b, a.Args[i], fld.Type, offset+fld.Offset, mem)
+			}
+			return mem
+		case OpArrayMake1:
+			return storeArg(pos, b, a.Args[0], t.Elem(), offset, mem)
+
+		case OpInt64Make:
+			tHi, tLo := pairTypes(t.Etype)
+			mem = storeArg(pos, b, a.Args[0], tHi, offset+hiOffset, mem)
+			return storeArg(pos, b, a.Args[1], tLo, offset+lowOffset, mem)
+		}
+		dst := f.ConstOffPtrSP(types.NewPtr(t), offset, sp)
+		x := b.NewValue3A(pos, OpStore, types.TypeMem, t, dst, a, mem)
+		if debug {
+			fmt.Printf("storeArg(%v) returns %s\n", a, x.LongString())
+		}
+		return x
+	}
+
+	// offsetFrom creates an offset from a pointer, simplifying chained offsets and offsets from SP
+	// TODO should also optimize offsets from SB?
+	offsetFrom := func(dst *Value, offset int64, t *types.Type) *Value {
+		pt := types.NewPtr(t)
+		if offset == 0 && dst.Type == pt { // this is not actually likely
+			return dst
+		}
+		if dst.Op != OpOffPtr {
+			return dst.Block.NewValue1I(dst.Pos.WithNotStmt(), OpOffPtr, pt, offset, dst)
+		}
+		// Simplify OpOffPtr
+		from := dst.Args[0]
+		offset += dst.AuxInt
+		if from == sp {
+			return f.ConstOffPtrSP(pt, offset, sp)
+		}
+		return dst.Block.NewValue1I(dst.Pos.WithNotStmt(), OpOffPtr, pt, offset, from)
+	}
+
+	// splitStore converts a store of an SSA-able aggregate into a series of smaller stores, emitting
+	// appropriate Struct/Array Select operations (which will soon go dead) to obtain the parts.
+	var splitStore func(dst, src, mem, v *Value, t *types.Type, offset int64, firstStorePos src.XPos) *Value
+	splitStore = func(dst, src, mem, v *Value, t *types.Type, offset int64, firstStorePos src.XPos) *Value {
+		// TODO might be worth commoning up duplicate selectors, but since they go dead, maybe no point.
+		pos := v.Pos.WithNotStmt()
+		switch t.Etype {
+		case types.TINT64, types.TUINT64:
+			if t.Width == regSize {
+				break
+			}
+			tHi, tLo := pairTypes(t.Etype)
+			sel := src.Block.NewValue1(pos, OpInt64Hi, tHi, src)
+			mem = splitStore(dst, sel, mem, v, tHi, offset+hiOffset, firstStorePos)
+			firstStorePos = firstStorePos.WithNotStmt()
+			sel = src.Block.NewValue1(pos, OpInt64Lo, tLo, src)
+			return splitStore(dst, sel, mem, v, tLo, offset+lowOffset, firstStorePos)
+
+		case types.TARRAY:
+			elt := t.Elem()
+			for i := int64(0); i < t.NumElem(); i++ {
+				sel := src.Block.NewValue1I(pos, OpArraySelect, elt, i, src)
+				mem = splitStore(dst, sel, mem, v, elt, offset+i*elt.Width, firstStorePos)
+				firstStorePos = firstStorePos.WithNotStmt()
+			}
+			return mem
+		case types.TSTRUCT:
+			if src.Op == OpIData && t.NumFields() == 1 && t.Field(0).Type.Width == t.Width && t.Width == regSize {
+				// This peculiar test deals with accesses to immediate interface data.
+				// It works okay because everything is the same size.
+				// Example code that triggers this can be found in go/constant/value.go, function ToComplex
+				// v119 (+881) = IData v6
+				// v121 (+882) = StaticLECall {AuxCall{"".itof([intVal,0])[floatVal,8]}} [16] v119 v1
+				// This corresponds to the generic rewrite rule "(StructSelect [0] (IData x)) => (IData x)"
+				// Guard against "struct{struct{*foo}}"
+				for t.Etype == types.TSTRUCT && t.NumFields() == 1 {
+					t = t.Field(0).Type
+				}
+				if t.Etype == types.TSTRUCT || t.Etype == types.TARRAY {
+					f.Fatalf("Did not expect to find IDATA-immediate with non-trivial struct in it")
+				}
+				break // handle the leaf type.
+			}
+			for i := 0; i < t.NumFields(); i++ {
+				fld := t.Field(i)
+				sel := src.Block.NewValue1I(pos, OpStructSelect, fld.Type, int64(i), src)
+				mem = splitStore(dst, sel, mem, v, fld.Type, offset+fld.Offset, firstStorePos)
+				firstStorePos = firstStorePos.WithNotStmt()
+			}
+			return mem
 		}
+		// Default, including for aggregates whose single element exactly fills their container
+		// TODO this will be a problem for cast interfaces containing floats when we move to registers.
+		x := v.Block.NewValue3A(firstStorePos, OpStore, types.TypeMem, t, offsetFrom(dst, offset, t), src, mem)
+		if debug {
+			fmt.Printf("splitStore(%v, %v, %v, %v) returns %s\n", dst, src, mem, v, x.LongString())
+		}
+		return x
 	}
 
-	// Step 2: rewrite the calls
+
+	// Step 0: rewrite the calls to convert incoming args to stores.
 	for _, b := range f.Blocks {
 		for _, v := range b.Values {
 			switch v.Op {
 			case OpStaticLECall:
 				// Thread the stores on the memory arg
-				m0 := v.Args[len(v.Args)-1]
+				m0 := v.MemoryArg()
 				mem := m0
 				pos := v.Pos.WithNotStmt()
 				aux := v.Aux.(*AuxCall)
-				auxInt := v.AuxInt
 				for i, a := range v.Args {
-					if a == m0 {
+					if a == m0 { // mem is last.
 						break
 					}
 					if a.Op == OpDereference {
 						// "Dereference" of addressed (probably not-SSA-eligible) value becomes Move
+						// TODO this will be more complicated with registers in the picture.
+						if a.MemoryArg() != m0 {
+							f.Fatalf("Op...LECall and OpDereference have mismatched mem, %s and %s", v.LongString(), a.LongString())
+						}
 						src := a.Args[0]
 						dst := f.ConstOffPtrSP(src.Type, aux.OffsetOfArg(int64(i)), sp)
-						a.reset(OpMove)
-						a.Pos = pos
-						a.Type = types.TypeMem
-						a.Aux = aux.TypeOfArg(int64(i))
-						a.AuxInt = aux.SizeOfArg(int64(i))
-						a.SetArgs3(dst, src, mem)
-						mem = a
+						if a.Uses == 1 {
+							a.reset(OpMove)
+							a.Pos = pos
+							a.Type = types.TypeMem
+							a.Aux = aux.TypeOfArg(int64(i))
+							a.AuxInt = aux.SizeOfArg(int64(i))
+							a.SetArgs3(dst, src, mem)
+							mem = a
+						} else {
+							mem = a.Block.NewValue3A(pos, OpMove, types.TypeMem, aux.TypeOfArg(int64(i)), dst, src, mem)
+							mem.AuxInt = aux.SizeOfArg(int64(i))
+						}
 					} else {
-						// Add a new store.
-						t := aux.TypeOfArg(int64(i))
-						dst := f.ConstOffPtrSP(types.NewPtr(t), aux.OffsetOfArg(int64(i)), sp)
-						mem = b.NewValue3A(pos, OpStore, types.TypeMem, t, dst, a, mem)
+						mem = storeArg(pos, b, a, aux.TypeOfArg(int64(i)), aux.OffsetOfArg(int64(i)), mem)
 					}
 				}
-				v.reset(OpStaticCall)
-				v.Type = types.TypeMem
-				v.Aux = aux
-				v.AuxInt = auxInt
+				v.resetArgs()
 				v.SetArgs1(mem)
 			}
 		}
 	}
+
+	// Step 1: any stores of aggregates remaining are believed to be sourced from call results.
+	// Decompose those stores into a series of smaller stores, adding selection ops as necessary.
+	for _, b := range f.Blocks {
+		for _, v := range b.Values {
+			if v.Op == OpStore {
+				t := v.Aux.(*types.Type)
+				if isAlreadyExpandedAggregateType(t) {
+					dst, src, mem := v.Args[0], v.Args[1], v.Args[2]
+					mem = splitStore(dst, src, mem, v, t, 0, v.Pos)
+					v.copyOf(mem)
+				}
+			}
+		}
+	}
+
+	val2Preds := make(map[*Value]int32) // Used to accumulate dependency graph of selection operations for topological ordering.
+
+	// Step 2: accumulate selection operations for rewrite in topological order.
+	// Any select-for-addressing applied to call results can be transformed directly.
+	// TODO this is overkill; with the transformation of aggregate references into series of leaf references, it is only necessary to remember and recurse on the leaves.
+	for _, b := range f.Blocks {
+		for _, v := range b.Values {
+			// Accumulate chains of selectors for processing in topological order
+			switch v.Op {
+			case OpStructSelect, OpArraySelect, OpInt64Hi, OpInt64Lo:
+				w := v.Args[0]
+				switch w.Op {
+				case OpStructSelect, OpArraySelect, OpInt64Hi, OpInt64Lo, OpSelectN:
+					val2Preds[w] += 1
+					if debug {
+						fmt.Printf("v2p[%s] = %d\n", w.LongString(), val2Preds[w])
+					}
+				}
+				fallthrough
+			case OpSelectN:
+				if _, ok := val2Preds[v]; !ok {
+					val2Preds[v] = 0
+					if debug {
+						fmt.Printf("v2p[%s] = %d\n", v.LongString(), val2Preds[v])
+					}
+				}
+			case OpSelectNAddr:
+				// Do these directly, there are no chains of selectors.
+				call := v.Args[0]
+				which := v.AuxInt
+				aux := call.Aux.(*AuxCall)
+				pt := v.Type
+				off := f.ConstOffPtrSP(pt, aux.OffsetOfResult(which), sp)
+				v.copyOf(off)
+			}
+		}
+	}
+
+	// Compilation must be deterministic
+	var ordered []*Value
+	less := func(i, j int) bool { return ordered[i].ID < ordered[j].ID }
+
+	// Step 3: Rewrite in topological order. All chains of selectors end up in same block as the call.
+	for len(val2Preds) > 0 {
+		ordered = ordered[:0]
+		for v, n := range val2Preds {
+			if n == 0 {
+				ordered = append(ordered, v)
+			}
+		}
+		sort.Slice(ordered, less)
+		for _, v := range ordered {
+			for {
+				w := v.Args[0]
+				if debug {
+					fmt.Printf("About to rewrite %s, args[0]=%s\n", v.LongString(), w.LongString())
+				}
+				delete(val2Preds, v)
+				rewriteSelect(v, v, 0)
+				v = w
+				n, ok := val2Preds[v]
+				if !ok {
+					break
+				}
+				if n != 1 {
+					val2Preds[v] = n - 1
+					break
+				}
+				// Loop on new v; val2Preds[v] == 1 will be deleted in that iteration, no need to store zero.
+			}
+		}
+	}
+
+	// Step 4: rewrite the calls themselves, correcting the type
+	for _, b := range f.Blocks {
+		for _, v := range b.Values {
+			switch v.Op {
+			case OpStaticLECall:
+				v.Op = OpStaticCall
+				v.Type = types.TypeMem
+			}
+		}
+	}
+
+	// Step 5: elide any copies introduced.
+	for _, b := range f.Blocks {
+		for _, v := range b.Values {
+			for i, a := range v.Args {
+				if a.Op != OpCopy {
+					continue
+				}
+				aa := copySource(a)
+				v.SetArg(i, aa)
+				for a.Uses == 0 {
+					b := a.Args[0]
+					a.reset(OpInvalid)
+					a = b
+				}
+			}
+		}
+	}
 }
diff --git a/src/cmd/compile/internal/ssa/writebarrier.go b/src/cmd/compile/internal/ssa/writebarrier.go
index df54a45b0f..849c9e8967 100644
--- a/src/cmd/compile/internal/ssa/writebarrier.go
+++ b/src/cmd/compile/internal/ssa/writebarrier.go
@@ -527,7 +527,7 @@ func IsStackAddr(v *Value) bool {
 		v = v.Args[0]
 	}
 	switch v.Op {
-	case OpSP, OpLocalAddr:
+	case OpSP, OpLocalAddr, OpSelectNAddr:
 		return true
 	}
 	return false
@@ -593,7 +593,7 @@ func IsSanitizerSafeAddr(v *Value) bool {
 		v = v.Args[0]
 	}
 	switch v.Op {
-	case OpSP, OpLocalAddr:
+	case OpSP, OpLocalAddr, OpSelectNAddr:
 		// Stack addresses are always safe.
 		return true
 	case OpITab, OpStringPtr, OpGetClosurePtr:
@@ -609,7 +609,7 @@ func IsSanitizerSafeAddr(v *Value) bool {
 // isVolatile reports whether v is a pointer to argument region on stack which
 // will be clobbered by a function call.
 func isVolatile(v *Value) bool {
-	for v.Op == OpOffPtr || v.Op == OpAddPtr || v.Op == OpPtrIndex || v.Op == OpCopy {
+	for v.Op == OpOffPtr || v.Op == OpAddPtr || v.Op == OpPtrIndex || v.Op == OpCopy || v.Op == OpSelectNAddr {
 		v = v.Args[0]
 	}
 	return v.Op == OpSP
-- 
2.48.1
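
Illustration (not part of the patch; the package, type, and function names below
are invented for the example): roughly the kind of Go source this pass is aimed
at -- a static call whose result is a small, SSA-able struct, with the fields
then selected individually. With late call expansion enabled, the call is built
as an OpStaticLECall, its result is consumed through OpSelectN, and the field
accesses become OpStructSelect ops, which expandCalls rewrites into loads from
the call's result slots on the stack.

	package p

	type pair struct{ x, y int }

	//go:noinline
	func mk(a, b int) pair { return pair{a, b} }

	func use(a, b int) int {
		p := mk(a, b)    // StaticLECall producing (pair, mem)
		return p.x + p.y // StructSelect of SelectN; expandCalls turns these
		                 // selector chains into loads from the call's result area
	}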