]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile/internal: merge stack slots for selected local auto vars
author Than McIntosh <thanm@google.com>
Wed, 3 Apr 2024 15:51:42 +0000 (15:51 +0000)
committer Than McIntosh <thanm@google.com>
Tue, 9 Apr 2024 16:41:23 +0000 (16:41 +0000)
[This is a partial roll-forward of CL 553055, the main change here
is that the stack slot overlap operation is flagged off by default
(can be enabled by hand with -gcflags=-d=mergelocals=1) ]

Preliminary compiler support for merging/overlapping stack slots of
local variables whose access patterns are disjoint.

This patch includes changes in AllocFrame to do the actual
merging/overlapping based on information returned from a new
liveness.MergeLocals helper. The MergeLocals helper identifies
candidates by looking for sets of AUTO variables that either A) have
the same size and GC shape (if types contain pointers), or B) have the
same size (but potentially different types as long as those types have
no pointers). Variables must be greater than (3*types.PtrSize) in size
to be considered for merging.

After forming candidates, MergeLocals collects variables into "can be
overlapped" equivalence classes or partitions; this process is driven
by an additional liveness analysis pass. Ideally it would be nice to
move the existing stackmap liveness pass up before AllocFrame
and "widen" it to include merge candidates so that we can do just a
single liveness as opposed to two passes, however this may be difficult
given that the merge-locals liveness has to take into account
writes corresponding to dead stores.

This patch also required a change to the way ssa.OpVarDef pseudo-ops
are generated; prior to this point they would only be created for
variables whose type included pointers; if stack slot merging is
enabled then the ssagen code creates OpVarDef ops for all auto vars
that are merge candidates.

Note that some temporaries created late in the compilation process
(e.g. during ssa backend) are difficult to reason about, especially in
cases where we take the address of a temp and pass it to the runtime.
For the time being we mark most of the vars created post-ssagen as
"not a merge candidate".

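In the original CL, stack slot merging for locals/autos was enabled by
default if "-N" is not in effect and could be disabled via
"-gcflags=-d=mergelocals=0"; in this roll-forward it is off by default
and has to be enabled with "-gcflags=-d=mergelocals=1" (see the note at
the top of this message).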

Fixmes/todos/restrictions:
- try lowering size restrictions
- re-evaluate the various skips that happen in SSA-created autotmps

Updates #62737.
Updates #65532.
Updates #65495.

Change-Id: Ifda26bc48cde5667de245c8a9671b3f0a30bb45d
Reviewed-on: https://go-review.googlesource.com/c/go/+/575415
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
14 files changed:
src/cmd/compile/internal/base/debug.go
src/cmd/compile/internal/base/flag.go
src/cmd/compile/internal/base/hashdebug.go
src/cmd/compile/internal/ir/name.go
src/cmd/compile/internal/liveness/mergelocals.go [new file with mode: 0644]
src/cmd/compile/internal/liveness/plive.go
src/cmd/compile/internal/ssa/check.go
src/cmd/compile/internal/ssa/func.go
src/cmd/compile/internal/ssagen/pgen.go
src/cmd/compile/internal/ssagen/ssa.go
src/cmd/compile/internal/test/mergelocals_test.go [new file with mode: 0644]
src/cmd/compile/internal/test/testdata/mergelocals/integration.go [new file with mode: 0644]
src/cmd/compile/internal/walk/temp.go
test/fixedbugs/bug385_64.go

index 420ad1305e8c86b8c91498fdc1c90257f0847b2d..08ccef30656747da15cd35a51225581cb2ff8fe5 100644 (file)
@@ -41,6 +41,10 @@ type DebugFlags struct {
        LoopVarHash           string `help:"for debugging changes in loop behavior. Overrides experiment and loopvar flag."`
        LocationLists         int    `help:"print information about DWARF location list creation"`
        MaxShapeLen           int    `help:"hash shape names longer than this threshold (default 500)" concurrent:"ok"`
+       MergeLocals           int    `help:"merge together non-interfering local stack slots" concurrent:"ok"`
+       MergeLocalsDumpFunc   string `help:"dump specified func in merge locals"`
+       MergeLocalsHash       string `help:"hash value for debugging stack slot merging of local variables" concurrent:"ok"`
+       MergeLocalsTrace      int    `help:"trace debug output for locals merging"`
        Nil                   int    `help:"print information about nil checks"`
        NoOpenDefer           int    `help:"disable open-coded defers" concurrent:"ok"`
        NoRefName             int    `help:"do not include referenced symbol names in object file" concurrent:"ok"`
index 5b3c3ad8c6045377ba732644986a021b8c8dd4d4..0889c37b0d40b93036627015b4da9d67bfdf9c8b 100644 (file)
@@ -260,6 +260,9 @@ func ParseFlags() {
        if Debug.PGOHash != "" {
                PGOHash = NewHashDebug("pgohash", Debug.PGOHash, nil)
        }
+       if Debug.MergeLocalsHash != "" {
+               MergeLocalsHash = NewHashDebug("mergelocals", Debug.MergeLocalsHash, nil)
+       }
 
        if Flag.MSan && !platform.MSanSupported(buildcfg.GOOS, buildcfg.GOARCH) {
                log.Fatalf("%s/%s does not support -msan", buildcfg.GOOS, buildcfg.GOARCH)
index 4e36c8d54971cd7c950513dd09cb8b70dead13a7..7a5cc42578ac119ebcb9d3eb7f39de9c4b0c5950 100644 (file)
@@ -53,9 +53,10 @@ func (d *HashDebug) SetInlineSuffixOnly(b bool) *HashDebug {
 // The default compiler-debugging HashDebug, for "-d=gossahash=..."
 var hashDebug *HashDebug
 
-var FmaHash *HashDebug     // for debugging fused-multiply-add floating point changes
-var LoopVarHash *HashDebug // for debugging shared/private loop variable changes
-var PGOHash *HashDebug     // for debugging PGO optimization decisions
+var FmaHash *HashDebug         // for debugging fused-multiply-add floating point changes
+var LoopVarHash *HashDebug     // for debugging shared/private loop variable changes
+var PGOHash *HashDebug         // for debugging PGO optimization decisions
+var MergeLocalsHash *HashDebug // for debugging local stack slot merging changes
 
 // DebugHashMatchPkgFunc reports whether debug variable Gossahash
 //
index 758158651e6c055c47116e2312b2a92286f8c62a..1ce6e43d0b62fd4bfe262ef69d4b20223d59c69c 100644 (file)
@@ -194,6 +194,7 @@ const (
        nameLibfuzzer8BitCounter     // if PEXTERN should be assigned to __sancov_cntrs section
        nameCoverageAuxVar           // instrumentation counter var or pkg ID for cmd/cover
        nameAlias                    // is type name an alias
+       nameNonMergeable             // not a candidate for stack slot merging
 )
 
 func (n *Name) Readonly() bool                 { return n.flags&nameReadonly != 0 }
@@ -209,6 +210,7 @@ func (n *Name) InlLocal() bool                 { return n.flags&nameInlLocal !=
 func (n *Name) OpenDeferSlot() bool            { return n.flags&nameOpenDeferSlot != 0 }
 func (n *Name) Libfuzzer8BitCounter() bool     { return n.flags&nameLibfuzzer8BitCounter != 0 }
 func (n *Name) CoverageAuxVar() bool           { return n.flags&nameCoverageAuxVar != 0 }
+func (n *Name) NonMergeable() bool             { return n.flags&nameNonMergeable != 0 }
 
 func (n *Name) setReadonly(b bool)                 { n.flags.set(nameReadonly, b) }
 func (n *Name) SetNeedzero(b bool)                 { n.flags.set(nameNeedzero, b) }
@@ -223,6 +225,7 @@ func (n *Name) SetInlLocal(b bool)                 { n.flags.set(nameInlLocal, b
 func (n *Name) SetOpenDeferSlot(b bool)            { n.flags.set(nameOpenDeferSlot, b) }
 func (n *Name) SetLibfuzzer8BitCounter(b bool)     { n.flags.set(nameLibfuzzer8BitCounter, b) }
 func (n *Name) SetCoverageAuxVar(b bool)           { n.flags.set(nameCoverageAuxVar, b) }
+func (n *Name) SetNonMergeable(b bool)             { n.flags.set(nameNonMergeable, b) }
 
 // OnStack reports whether variable n may reside on the stack.
 func (n *Name) OnStack() bool {
diff --git a/src/cmd/compile/internal/liveness/mergelocals.go b/src/cmd/compile/internal/liveness/mergelocals.go
new file mode 100644 (file)
index 0000000..a1342ef
--- /dev/null
@@ -0,0 +1,691 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package liveness
+
+import (
+       "cmd/compile/internal/base"
+       "cmd/compile/internal/bitvec"
+       "cmd/compile/internal/ir"
+       "cmd/compile/internal/reflectdata"
+       "cmd/compile/internal/ssa"
+       "cmd/internal/obj"
+       "cmd/internal/src"
+       "fmt"
+       "os"
+       "path/filepath"
+       "sort"
+       "strings"
+)
+
+// MergeLocalsState encapsulates information about which AUTO
+// (stack-allocated) variables within a function can be safely
+// merged/overlapped (e.g. share a stack slot with some other auto).
+// An instance of MergeLocalsState is produced by MergeLocals() below
+// and then consumed in ssagen.AllocFrame. The map 'partition' contains
+// entries of the form <N,SL> where N is an *ir.Name and SL is a slice
+// holding the indices (within 'vars') of the variables that share the
+// same slot, N itself included; the first index in SL identifies the
+// partition's leader. For example, if a function contains five
+// variables where v1/v2/v3 are safe to overlap and v4/v5 are safe to
+// overlap, the MergeLocalsState content might look like
+//
+//     vars: [v1, v2, v3, v4, v5]
+//     partition: v1 -> [1, 0, 2], v2 -> [1, 0, 2], v3 -> [1, 0, 2]
+//                v4 -> [3, 4], v5 -> [3, 4]
+//
+// A nil MergeLocalsState indicates that no local variables meet the
+// necessary criteria for overlap.
+type MergeLocalsState struct {
+       // contains auto vars that participate in overlapping
+       vars []*ir.Name
+       // maps auto variable to overlap partition
+       partition map[*ir.Name][]int
+}
+
+// candRegion describes one region of mutually compatible candidates:
+// the inclusive index interval [st,en] within the list of candidate
+// variables (st is the first slot of the region, en the last).
+type candRegion struct {
+       st, en int
+}
+
+// MergeLocals analyzes the specified ssa function f to determine which
+// of its auto variables can safely share the same stack slot, returning
+// a state object that describes how the overlap should be done. The
+// result is nil when no candidate regions are found or when no
+// variables end up being overlapped. If the computed state fails its
+// consistency check, compilation is aborted via base.FatalfAt.
+func MergeLocals(fn *ir.Func, f *ssa.Func) *MergeLocalsState {
+       cands, idx, regions := collectMergeCandidates(fn)
+       if len(regions) == 0 {
+               return nil
+       }
+       lv := newliveness(fn, f, cands, idx, 0)
+
+       // If we have a local variable such as "r2" below that's written
+       // but then not read, something like:
+       //
+       //      vardef r1
+       //      r1.x = ...
+       //      vardef r2
+       //      r2.x = 0
+       //      r2.y = ...
+       //      <call foo>
+       //      // no subsequent use of r2
+       //      ... = r1.x
+       //
+       // then for the purpose of calculating stack maps at the call, we
+       // can ignore "r2" completely during liveness analysis for stack
+       // maps, however for stack slot merging we most definitely want
+       // to treat the writes as "uses".
+       lv.conservativeWrites = true
+
+       lv.prologue()
+       lv.solve()
+       // One intervals builder per candidate, indexed like cands.
+       cs := &cstate{
+               fn:        fn,
+               ibuilders: make([]IntervalsBuilder, len(cands)),
+       }
+       computeIntervals(lv, cs)
+       rv := performMerging(lv, cs, regions)
+       // check accepts a nil state, so no nil test is needed here.
+       if err := rv.check(); err != nil {
+               base.FatalfAt(fn.Pos(), "invalid mergelocals state: %v", err)
+       }
+       return rv
+}
+
+// Subsumed reports whether variable n is subsumed, i.e. it belongs to
+// an overlap partition but is not that partition's leader.
+func (mls *MergeLocalsState) Subsumed(n *ir.Name) bool {
+       part, found := mls.partition[n]
+       return found && mls.vars[part[0]] != n
+}
+
+// IsLeader reports whether variable n heads a sharing partition, i.e.
+// it is the variable named by the first index of its partition slice.
+func (mls *MergeLocalsState) IsLeader(n *ir.Name) bool {
+       part, found := mls.partition[n]
+       return found && mls.vars[part[0]] == n
+}
+
+// Leader returns the leader variable of the partition that subsumed
+// var n belongs to. It panics if n has no partition entry or if n is
+// itself the leader.
+func (mls *MergeLocalsState) Leader(n *ir.Name) *ir.Name {
+       part, found := mls.partition[n]
+       if !found {
+               panic("not a merge candidate")
+       }
+       head := mls.vars[part[0]]
+       if head == n {
+               panic("variable is not subsumed")
+       }
+       return head
+}
+
+// Followers collects the followers of leader n, reusing the storage of
+// tmp, and returns them ordered by symbol name. It panics if n has no
+// partition entry or is not the leader of its partition.
+func (mls *MergeLocalsState) Followers(n *ir.Name, tmp []*ir.Name) []*ir.Name {
+       part, found := mls.partition[n]
+       switch {
+       case !found:
+               panic("no entry for leader")
+       case mls.vars[part[0]] != n:
+               panic("followers invoked on subsumed var")
+       }
+       // Every index after the first names a follower.
+       out := tmp[:0]
+       for i := 1; i < len(part); i++ {
+               out = append(out, mls.vars[part[i]])
+       }
+       byName := func(a, b int) bool {
+               return out[a].Sym().Name < out[b].Sym().Name
+       }
+       sort.SliceStable(out, byName)
+       return out
+}
+
+// EstSavings returns the estimated reduction in stack size for the
+// given merge locals state: the summed sizes of all subsumed
+// (non-leader) variables.
+func (mls *MergeLocalsState) EstSavings() int {
+       saved := 0
+       for v, part := range mls.partition {
+               if mls.vars[part[0]] == v {
+                       // Leaders keep their own slot; nothing saved.
+                       continue
+               }
+               saved += int(v.Type().Size())
+       }
+       return saved
+}
+
+// check tests for various inconsistencies and problems in mls,
+// returning an error if any problems are found. A nil state is
+// considered valid. The properties verified are:
+//   - no variable appears twice in vars;
+//   - every partition slice has at least two entries, all of which are
+//     valid indices into vars;
+//     - the first entry of each partition slice is a leader, all
+//     remaining entries are subsumed by that leader;
+//   - each key appears in its own partition slice;
+//   - every entry of vars is claimed by some partition key.
+func (mls *MergeLocalsState) check() error {
+       if mls == nil {
+               return nil
+       }
+       used := make(map[int]bool)
+       seenv := make(map[*ir.Name]int)
+       for ii, v := range mls.vars {
+               if prev, ok := seenv[v]; ok {
+                       return fmt.Errorf("duplicate var %q in vslots: %d and %d",
+                               v.Sym().Name, ii, prev)
+               }
+               seenv[v] = ii
+       }
+       for k, sl := range mls.partition {
+               // length of slice value needs to be more than 1
+               if len(sl) < 2 {
+                       return fmt.Errorf("k=%q v=%+v slice len %d invalid",
+                               k.Sym().Name, sl, len(sl))
+               }
+               // values in the slice need to be var indices
+               for i, v := range sl {
+                       if v < 0 || v > len(mls.vars)-1 {
+                               return fmt.Errorf("k=%q v=%+v slpos %d vslot %d out of range of m.v", k.Sym().Name, sl, i, v)
+                       }
+               }
+       }
+       for k, sl := range mls.partition {
+               foundk := false
+               for i, v := range sl {
+                       vv := mls.vars[v]
+                       if i == 0 {
+                               if !mls.IsLeader(vv) {
+                                       return fmt.Errorf("k=%s v=%+v slpos 0 vslot %d IsLeader(%q) is false should be true", k.Sym().Name, sl, v, vv.Sym().Name)
+                               }
+                       } else {
+                               // Subsumed must be tested first: Leader panics
+                               // for a var with no partition entry.
+                               if !mls.Subsumed(vv) {
+                                       return fmt.Errorf("k=%s v=%+v slpos %d vslot %d Subsumed(%q) is false should be true", k.Sym().Name, sl, i, v, vv.Sym().Name)
+                               }
+                               if mls.Leader(vv) != mls.vars[sl[0]] {
+                                       return fmt.Errorf("k=%s v=%+v slpos %d vslot %d Leader(%q) got %v want %v", k.Sym().Name, sl, i, v, vv.Sym().Name, mls.Leader(vv), mls.vars[sl[0]])
+                               }
+                       }
+                       if vv == k {
+                               foundk = true
+                               if used[v] {
+                                       return fmt.Errorf("k=%s v=%+v val slice used violation at slpos %d vslot %d", k.Sym().Name, sl, i, v)
+                               }
+                               used[v] = true
+                       }
+               }
+               if !foundk {
+                       return fmt.Errorf("k=%s v=%+v slice value missing k", k.Sym().Name, sl)
+               }
+       }
+       // Walk the var list itself (not the 'used' map, which only ever
+       // holds true entries) so that unclaimed slots are actually caught.
+       for i, v := range mls.vars {
+               if !used[i] {
+                       return fmt.Errorf("pos %d var %q unused", i, v.Sym().Name)
+               }
+       }
+       return nil
+}
+
+// String renders the state as one line per partition: the leader's
+// name followed by ":" and the space-separated names of its followers
+// in partition order. Lines are ordered by leader name.
+func (mls *MergeLocalsState) String() string {
+       var heads []*ir.Name
+       for v, part := range mls.partition {
+               if mls.vars[part[0]] == v {
+                       heads = append(heads, v)
+               }
+       }
+       sort.Slice(heads, func(a, b int) bool {
+               return heads[a].Sym().Name < heads[b].Sym().Name
+       })
+       var out strings.Builder
+       for _, head := range heads {
+               out.WriteString(head.Sym().Name)
+               out.WriteString(":")
+               for _, idx := range mls.partition[head][1:] {
+                       out.WriteString(" ")
+                       out.WriteString(mls.vars[idx].Sym().Name)
+               }
+               out.WriteString("\n")
+       }
+       return out.String()
+}
+
+// collectMergeCandidates visits all of the AUTO vars declared in
+// function fn and returns a list of candidate variables for merging /
+// overlapping. Return values are: 1) a slice of ir.Name's
+// corresponding to the candidates, 2) a map that maps ir.Name to slot
+// in the slice, and 3) a slice containing regions (start/end pairs,
+// both inclusive) corresponding to variables that could be overlapped
+// provided that their lifetimes are disjoint. All three results are
+// nil if fewer than two variables could possibly be overlapped.
+func collectMergeCandidates(fn *ir.Func) ([]*ir.Name, map[*ir.Name]int32, []candRegion) {
+       m := make(map[*ir.Name]int32)
+       var cands []*ir.Name
+       var regions []candRegion
+
+       // Collect up the available set of appropriate AUTOs in the
+       // function as a first step.
+       for _, n := range fn.Dcl {
+               if !n.Used() {
+                       continue
+               }
+               if !ssa.IsMergeCandidate(n) {
+                       continue
+               }
+               cands = append(cands, n)
+       }
+       if len(cands) < 2 {
+               return nil, nil, nil
+       }
+
+       // Sort by pointerness, size, GC shape (for pointerful types),
+       // name, and finally source position, so that compatible
+       // variables end up adjacent in the list.
+       sort.SliceStable(cands, func(i, j int) bool {
+               ci, cj := cands[i], cands[j]
+               ihp, jhp := 0, 0
+               var ilsym, jlsym *obj.LSym
+               if ci.Type().HasPointers() {
+                       ihp = 1
+                       ilsym, _, _ = reflectdata.GCSym(ci.Type())
+               }
+               if cj.Type().HasPointers() {
+                       jhp = 1
+                       jlsym, _, _ = reflectdata.GCSym(cj.Type())
+               }
+               if ihp != jhp {
+                       return ihp < jhp
+               }
+               if ci.Type().Size() != cj.Type().Size() {
+                       return ci.Type().Size() < cj.Type().Size()
+               }
+               if ihp != 0 && jhp != 0 && ilsym != jlsym {
+                       // FIXME: find less clunky way to do this
+                       return fmt.Sprintf("%v", ilsym) < fmt.Sprintf("%v", jlsym)
+               }
+               if ci.Sym().Name != cj.Sym().Name {
+                       return ci.Sym().Name < cj.Sym().Name
+               }
+               // Final tie-break for same-named variables: compare the two
+               // variables' positions against each other.
+               return fmt.Sprintf("%v", ci.Pos()) < fmt.Sprintf("%v", cj.Pos())
+       })
+
+       if base.Debug.MergeLocalsTrace > 1 {
+               fmt.Fprintf(os.Stderr, "=-= raw cand list for func %v:\n", fn)
+               for i := range cands {
+                       dumpCand(cands[i], i)
+               }
+       }
+
+       // Now generate a pruned candidate list-- we only want to return a
+       // non-empty list if there is some possibility of overlapping two
+       // vars.
+       var pruned []*ir.Name
+       st := 0
+       for {
+               en := nextRegion(cands, st)
+               if en == -1 {
+                       break
+               }
+               if st == en {
+                       // region has just one element, we can skip it
+                       st++
+                       continue
+               }
+               // Region bounds re-expressed as indices into 'pruned'.
+               pst := len(pruned)
+               pen := pst + (en - st)
+               if base.Debug.MergeLocalsTrace > 1 {
+                       fmt.Fprintf(os.Stderr, "=-= add part %d -> %d\n", pst, pen)
+               }
+
+               // non-empty region, add to pruned
+               pruned = append(pruned, cands[st:en+1]...)
+               regions = append(regions, candRegion{st: pst, en: pen})
+               st = en + 1
+       }
+       if len(pruned) < 2 {
+               return nil, nil, nil
+       }
+       for i, n := range pruned {
+               m[n] = int32(i)
+       }
+
+       if base.Debug.MergeLocalsTrace > 1 {
+               fmt.Fprintf(os.Stderr, "=-= pruned candidate list for func %v:\n", fn)
+               for i := range pruned {
+                       dumpCand(pruned[i], i)
+               }
+       }
+       return pruned, m, regions
+}
+
+// nextRegion starts at location idx and walks forward in the cands
+// slice looking for variables that are "compatible" (overlappable)
+// with the variable at position idx; it returns the (inclusive) end of
+// the new region (range of compatible variables starting at idx), or
+// -1 if idx is past the end of cands. Two variables are compatible
+// when they agree on pointerness and size and, for types containing
+// pointers, on GC shape.
+func nextRegion(cands []*ir.Name, idx int) int {
+       n := len(cands)
+       if idx >= n {
+               return -1
+       }
+       c0 := cands[idx]
+       hp0 := c0.Type().HasPointers()
+       for j := idx + 1; j < n; j++ {
+               cj := cands[j]
+               hpj := cj.Type().HasPointers()
+               // Pointerness and size must agree in every case.
+               if hp0 != hpj || c0.Type().Size() != cj.Type().Size() {
+                       return j - 1
+               }
+               if hp0 {
+                       // GC shape must match if both types have pointers.
+                       gcsym0, _, _ := reflectdata.GCSym(c0.Type())
+                       gcsymj, _, _ := reflectdata.GCSym(cj.Type())
+                       if gcsym0 != gcsymj {
+                               return j - 1
+                       }
+               }
+       }
+       return n - 1
+}
+
+// cstate carries the working state used while computing lifetime
+// intervals for the merge candidates of one function: fn is the
+// function being compiled, and ibuilders holds one IntervalsBuilder
+// per candidate variable (MergeLocals sizes it to len(cands)).
+type cstate struct {
+       fn        *ir.Func
+       ibuilders []IntervalsBuilder
+}
+
+// mergeVisitRegion tries to perform overlapping of variables with a
+// given subrange of cands described by st and en (indices into our
+// candidate var list), where the variables within this range have
+// already been determined to be compatible with respect to type,
+// size, etc. Overlapping is done in a greedy fashion: we select the
+// first element in the st->en range, then walk the rest of the
+// elements adding in vars whose lifetimes don't overlap with the
+// first element, then repeat the process until we run out of work to do.
+// ivs holds the lifetime intervals of each candidate, indexed like
+// lv.vars. Results are recorded by appending to mls.vars and adding
+// entries to mls.partition.
+func (mls *MergeLocalsState) mergeVisitRegion(lv *liveness, ivs []Intervals, st, en int) {
+       if base.Debug.MergeLocalsTrace > 1 {
+               fmt.Fprintf(os.Stderr, "=-= mergeVisitRegion(st=%d, en=%d)\n", st, en)
+       }
+       n := en - st + 1
+       // used is indexed relative to st: bit c corresponds to candidate st+c.
+       used := bitvec.New(int32(n))
+
+       // nxt returns the first not-yet-used candidate slot at or after
+       // 'slot' within this region, or -1 if there is none.
+       nxt := func(slot int) int {
+               for c := slot - st; c < n; c++ {
+                       if used.Get(int32(c)) {
+                               continue
+                       }
+                       return c + st
+               }
+               return -1
+       }
+
+       navail := n
+       cands := lv.vars
+       if base.Debug.MergeLocalsTrace > 1 {
+               fmt.Fprintf(os.Stderr, "  =-= navail = %d\n", navail)
+       }
+       for navail >= 2 {
+               // Pick the first unused candidate as the leader of a new
+               // (tentative) partition.
+               leader := nxt(st)
+               used.Set(int32(leader - st))
+               navail--
+
+               if base.Debug.MergeLocalsTrace > 1 {
+                       fmt.Fprintf(os.Stderr, "  =-= begin leader %d used=%s\n", leader,
+                               used.String())
+               }
+               elems := []int{leader}
+               // lints accumulates the merged lifetime of everything placed
+               // in this partition so far.
+               lints := ivs[leader]
+
+               for succ := nxt(leader + 1); succ != -1; succ = nxt(succ + 1) {
+
+                       // Skip if de-selected by merge locals hash.
+                       if base.Debug.MergeLocalsHash != "" {
+                               if !base.MergeLocalsHash.MatchPosWithInfo(cands[succ].Pos(), "mergelocals", nil) {
+                                       continue
+                               }
+                       }
+                       // Skip if already used.
+                       if used.Get(int32(succ - st)) {
+                               continue
+                       }
+                       if base.Debug.MergeLocalsTrace > 1 {
+                               fmt.Fprintf(os.Stderr, "  =-= overlap of %d[%v] {%s} with %d[%v] {%s} is: %v\n", leader, cands[leader], lints.String(), succ, cands[succ], ivs[succ].String(), lints.Overlaps(ivs[succ]))
+                       }
+
+                       // Can we overlap leader with this var?
+                       if lints.Overlaps(ivs[succ]) {
+                               continue
+                       } else {
+                               // Add to overlap set.
+                               elems = append(elems, succ)
+                               lints = lints.Merge(ivs[succ])
+                       }
+               }
+               if len(elems) > 1 {
+                       // We found some things to overlap with leader. Add the
+                       // candidate elements to "vars" and update "partition".
+                       // All members share the same slice sl, whose first
+                       // entry is the leader's index in mls.vars.
+                       off := len(mls.vars)
+                       sl := make([]int, len(elems))
+                       for i, candslot := range elems {
+                               sl[i] = off + i
+                               mls.vars = append(mls.vars, cands[candslot])
+                               mls.partition[cands[candslot]] = sl
+                       }
+                       // The leader was already counted above; account for
+                       // the followers only.
+                       navail -= (len(elems) - 1)
+                       for i := range elems {
+                               used.Set(int32(elems[i] - st))
+                       }
+                       if base.Debug.MergeLocalsTrace > 1 {
+                               fmt.Fprintf(os.Stderr, "=-= overlapping %+v:\n", sl)
+                               for i := range sl {
+                                       dumpCand(mls.vars[sl[i]], sl[i])
+                               }
+                               for i, v := range elems {
+                                       fmt.Fprintf(os.Stderr, "=-= %d: sl=%d %s\n", i, v, ivs[v])
+                               }
+                       }
+               }
+       }
+}
+
+// performMerging carries out variable merging within each of the
+// candidate ranges in regions, returning a state object
+// that describes the variable overlaps. It first finalizes the
+// per-candidate interval builders held in cs (a failure there is
+// reported with base.FatalfAt) and returns nil if no variables could
+// be overlapped.
+func performMerging(lv *liveness, cs *cstate, regions []candRegion) *MergeLocalsState {
+       cands := lv.vars
+       mls := &MergeLocalsState{
+               partition: make(map[*ir.Name][]int),
+       }
+
+       // Finish intervals construction.
+       ivs := make([]Intervals, len(cands))
+       for i := range cands {
+               var err error
+               ivs[i], err = cs.ibuilders[i].Finish()
+               if err != nil {
+                       // When tracing is on, dump every instruction of the
+                       // function before reporting the error; ninstr stays 0
+                       // otherwise.
+                       ninstr := 0
+                       if base.Debug.MergeLocalsTrace != 0 {
+                               iidx := 0
+                               for k := 0; k < len(lv.f.Blocks); k++ {
+                                       b := lv.f.Blocks[k]
+                                       fmt.Fprintf(os.Stderr, "\n")
+                                       for _, v := range b.Values {
+                                               fmt.Fprintf(os.Stderr, " b%d %d: %s\n", k, iidx, v.LongString())
+                                               iidx++
+                                               ninstr++
+                                       }
+                               }
+                       }
+                       base.FatalfAt(cands[i].Pos(), "interval construct error for var %q in func %q (%d instrs): %v", cands[i].Sym().Name, ir.FuncName(cs.fn), ninstr, err)
+                       return nil
+               }
+       }
+
+       // Dump state before attempting overlap.
+       if base.Debug.MergeLocalsTrace > 1 {
+               fmt.Fprintf(os.Stderr, "=-= cands live before overlap:\n")
+               for i := range cands {
+                       c := cands[i]
+                       fmt.Fprintf(os.Stderr, "%d: %v sz=%d ivs=%s\n",
+                               i, c.Sym().Name, c.Type().Size(), ivs[i].String())
+               }
+               fmt.Fprintf(os.Stderr, "=-= regions (%d): ", len(regions))
+               for _, cr := range regions {
+                       fmt.Fprintf(os.Stderr, " [%d,%d]", cr.st, cr.en)
+               }
+               fmt.Fprintf(os.Stderr, "\n")
+       }
+
+       if base.Debug.MergeLocalsTrace > 1 {
+               fmt.Fprintf(os.Stderr, "=-= len(regions) = %d\n", len(regions))
+       }
+
+       // Apply a greedy merge/overlap strategy within each region
+       // of compatible variables.
+       for _, cr := range regions {
+               mls.mergeVisitRegion(lv, ivs, cr.st, cr.en)
+       }
+       // Nothing was overlapped: report that with a nil state.
+       if len(mls.vars) == 0 {
+               return nil
+       }
+       return mls
+}
+
+// computeIntervals performs a backwards sweep over the instructions
+// of the function we're compiling, building up an Intervals object
+// for each candidate variable by looking for upwards exposed uses
+// and kills.
+// computeIntervals walks the blocks of lv.f from last to first, and
+// the values within each block from last to first, reporting every
+// live/kill transition of each variable in lv.vars to the matching
+// entry of cs.ibuilders. Instruction indices are assigned by counting
+// down from (total number of values - 1) as the walk proceeds.
+func computeIntervals(lv *liveness, cs *cstate) {
+       nvars := int32(len(lv.vars))
+       liveout := bitvec.New(nvars)
+
+       // Debugging aid: if the function's name ends with the string in
+       // base.Debug.MergeLocalsDumpFunc, dump every value to stderr,
+       // numbered in forward order.
+       if base.Debug.MergeLocalsDumpFunc != "" &&
+               strings.HasSuffix(fmt.Sprintf("%v", cs.fn), base.Debug.MergeLocalsDumpFunc) {
+               fmt.Fprintf(os.Stderr, "=-= mergelocalsdumpfunc %v:\n", cs.fn)
+               ii := 0
+               for k, b := range lv.f.Blocks {
+                       fmt.Fprintf(os.Stderr, "b%d:\n", k)
+                       for _, v := range b.Values {
+                               pos := base.Ctxt.PosTable.Pos(v.Pos)
+                               fmt.Fprintf(os.Stderr, "=-= %d L%d|C%d %s\n", ii, pos.RelLine(), pos.RelCol(), v.LongString())
+                               ii++
+                       }
+               }
+       }
+
+       // Count instructions.
+       ninstr := 0
+       for _, b := range lv.f.Blocks {
+               ninstr += len(b.Values)
+       }
+       // current instruction index during backwards walk
+       iidx := ninstr - 1
+
+       // Make a backwards pass over all blocks
+       for k := len(lv.f.Blocks) - 1; k >= 0; k-- {
+               b := lv.f.Blocks[k]
+               be := lv.blockEffects(b)
+
+               if base.Debug.MergeLocalsTrace > 2 {
+                       fmt.Fprintf(os.Stderr, "=-= liveout from tail of b%d: ", k)
+                       for j := range lv.vars {
+                               if be.liveout.Get(int32(j)) {
+                                       fmt.Fprintf(os.Stderr, " %q", lv.vars[j].Sym().Name)
+                               }
+                       }
+                       fmt.Fprintf(os.Stderr, "\n")
+               }
+
+               // Take into account effects taking place at end of this basic
+               // block by comparing our current live set with liveout for
+               // the block. If a given var was not live before and is now
+               // becoming live we need to mark this transition with a
+               // builder "Live" call; similarly if a var was live before and
+               // is now no longer live, we need a "Kill" call.
+               for j := range lv.vars {
+                       isLive := liveout.Get(int32(j))
+                       blockLiveOut := be.liveout.Get(int32(j))
+                       if isLive {
+                               if !blockLiveOut {
+                                       if base.Debug.MergeLocalsTrace > 2 {
+                                               fmt.Fprintf(os.Stderr, "=+= at instr %d block boundary kill of %v\n", iidx, lv.vars[j])
+                                       }
+                                       cs.ibuilders[j].Kill(iidx)
+                               }
+                       } else if blockLiveOut {
+                               if base.Debug.MergeLocalsTrace > 2 {
+                                       fmt.Fprintf(os.Stderr, "=+= at block-end instr %d %v becomes live\n",
+                                               iidx, lv.vars[j])
+                               }
+                               cs.ibuilders[j].Live(iidx)
+                       }
+               }
+
+               // Set our working "currently live" set to the previously
+               // computed live out set for the block.
+               liveout.Copy(be.liveout)
+
+               // Now walk backwards through this block.
+               for i := len(b.Values) - 1; i >= 0; i-- {
+                       v := b.Values[i]
+
+                       if base.Debug.MergeLocalsTrace > 2 {
+                               fmt.Fprintf(os.Stderr, "=-= b%d instr %d: %s\n", k, iidx, v.LongString())
+                       }
+
+                       // Update liveness based on what we see happening in this
+                       // instruction.
+                       pos, e := lv.valueEffects(v)
+                       becomeslive := e&uevar != 0
+                       iskilled := e&varkill != 0
+                       if becomeslive && iskilled {
+                               // we do not ever expect to see both a kill and an
+                               // upwards exposed use given our size constraints.
+                               panic("should never happen")
+                       }
+                       // Only record a transition when the live state of the
+                       // variable actually changes at this instruction.
+                       if iskilled && liveout.Get(pos) {
+                               cs.ibuilders[pos].Kill(iidx)
+                               liveout.Unset(pos)
+                               if base.Debug.MergeLocalsTrace > 2 {
+                                       fmt.Fprintf(os.Stderr, "=+= at instr %d kill of %v\n",
+                                               iidx, lv.vars[pos])
+                               }
+                       } else if becomeslive && !liveout.Get(pos) {
+                               cs.ibuilders[pos].Live(iidx)
+                               liveout.Set(pos)
+                               if base.Debug.MergeLocalsTrace > 2 {
+                                       fmt.Fprintf(os.Stderr, "=+= at instr %d upwards-exposed use of %v\n",
+                                               iidx, lv.vars[pos])
+                               }
+                       }
+                       iidx--
+               }
+
+               // Having walked back to the top of the entry block, no
+               // tracked variable may still be live.
+               if b == lv.f.Entry {
+                       for j, v := range lv.vars {
+                               if liveout.Get(int32(j)) {
+                                       lv.f.Fatalf("%v %L recorded as live on entry",
+                                               lv.fn.Nname, v)
+                               }
+                       }
+               }
+       }
+       // Sanity check: the walk must have consumed exactly ninstr indices.
+       if iidx != -1 {
+               panic("iidx underflow")
+       }
+}
+
+// dumpCand writes a one-line description of candidate c to stderr:
+// the index i, every position that base.Ctxt.AllPos reports for c
+// (as "file:line:col", joined with "|"), followed by the variable's
+// name, size, whether its type has pointers, and its type.
+func dumpCand(c *ir.Name, i int) {
+       fmtFullPos := func(p src.XPos) string {
+               var sb strings.Builder
+               sep := ""
+               base.Ctxt.AllPos(p, func(pos src.Pos) {
+                       // Write the separator verbatim; it is data, not a
+                       // format string, so it must not go through Fprintf.
+                       sb.WriteString(sep)
+                       sep = "|"
+                       file := filepath.Base(pos.Filename())
+                       fmt.Fprintf(&sb, "%s:%d:%d", file, pos.Line(), pos.Col())
+               })
+               return sb.String()
+       }
+       fmt.Fprintf(os.Stderr, " %d: %s %q sz=%d hp=%v t=%v\n",
+               i, fmtFullPos(c.Pos()), c.Sym().Name, c.Type().Size(),
+               c.Type().HasPointers(), c.Type())
+}
+
+// for unit testing only.
+func MakeMergeLocalsState(partition map[*ir.Name][]int, vars []*ir.Name) (*MergeLocalsState, error) {
+       mls := &MergeLocalsState{partition: partition, vars: vars}
+       if err := mls.check(); err != nil {
+               return nil, err
+       }
+       return mls, nil
+}
index e4dbfa9fa31e783896be77371295ad5269de1fb4..ab1a7df93030becb99cfbfd8d4ee5d6f81a926ca 100644 (file)
@@ -143,6 +143,11 @@ type liveness struct {
 
        doClobber     bool // Whether to clobber dead stack slots in this function.
        noClobberArgs bool // Do not clobber function arguments
+
+       // treat "dead" writes as equivalent to reads during the analysis;
+       // used only during liveness analysis for stack slot merging (doesn't
+       // make sense for stackmap analysis).
+       conservativeWrites bool
 }
 
 // Map maps from *ssa.Value to StackMapIndex.
@@ -312,8 +317,12 @@ func (lv *liveness) valueEffects(v *ssa.Value) (int32, liveEffect) {
        if e&(ssa.SymRead|ssa.SymAddr) != 0 {
                effect |= uevar
        }
-       if e&ssa.SymWrite != 0 && (!isfat(n.Type()) || v.Op == ssa.OpVarDef) {
-               effect |= varkill
+       if e&ssa.SymWrite != 0 {
+               if !isfat(n.Type()) || v.Op == ssa.OpVarDef {
+                       effect |= varkill
+               } else if lv.conservativeWrites {
+                       effect |= uevar
+               }
        }
 
        if effect == 0 {
@@ -450,6 +459,11 @@ func (lv *liveness) blockEffects(b *ssa.Block) *blockEffects {
 // this argument and the in arguments are always assumed live. The vars
 // argument is a slice of *Nodes.
 func (lv *liveness) pointerMap(liveout bitvec.BitVec, vars []*ir.Name, args, locals bitvec.BitVec) {
+       var slotsSeen map[int64]*ir.Name
+       checkForDuplicateSlots := base.Debug.MergeLocals != 0
+       if checkForDuplicateSlots {
+               slotsSeen = make(map[int64]*ir.Name)
+       }
        for i := int32(0); ; i++ {
                i = liveout.Next(i)
                if i < 0 {
@@ -468,6 +482,12 @@ func (lv *liveness) pointerMap(liveout bitvec.BitVec, vars []*ir.Name, args, loc
                        fallthrough // PPARAMOUT in registers acts memory-allocates like an AUTO
                case ir.PAUTO:
                        typebits.Set(node.Type(), node.FrameOffset()+lv.stkptrsize, locals)
+                       if checkForDuplicateSlots {
+                               if prev, ok := slotsSeen[node.FrameOffset()]; ok {
+                                       base.FatalfAt(node.Pos(), "two vars live at pointerMap generation: %q and %q", prev.Sym().Name, node.Sym().Name)
+                               }
+                               slotsSeen[node.FrameOffset()] = node
+                       }
                }
        }
 }
index bbfdaceaad90b0f966ce63587eaa0ff0b63dccc1..cb6788cd952c4e4b2c5ac34612ee029d63de1cb7 100644 (file)
@@ -314,8 +314,9 @@ func checkFunc(f *Func) {
                                        f.Fatalf("bad arg 1 type to %s: want integer, have %s", v.Op, v.Args[1].LongString())
                                }
                        case OpVarDef:
-                               if !v.Aux.(*ir.Name).Type().HasPointers() {
-                                       f.Fatalf("vardef must have pointer type %s", v.Aux.(*ir.Name).Type().String())
+                               n := v.Aux.(*ir.Name)
+                               if !n.Type().HasPointers() && !IsMergeCandidate(n) {
+                                       f.Fatalf("vardef must be merge candidate or have pointer type %s", v.Aux.(*ir.Name).Type().String())
                                }
                        case OpNilCheck:
                                // nil checks have pointer type before scheduling, and
index 031d94f90cf1ffad38136b37d19135087ef0e5b2..38b459a2ff3a6cac91fccd4e393873f108f80a63 100644 (file)
@@ -838,5 +838,25 @@ func (f *Func) useFMA(v *Value) bool {
 
 // NewLocal returns a new anonymous local variable of the given type.
 func (f *Func) NewLocal(pos src.XPos, typ *types.Type) *ir.Name {
-       return typecheck.TempAt(pos, f.fe.Func(), typ) // Note: adds new auto to fn.Dcl list
+       nn := typecheck.TempAt(pos, f.fe.Func(), typ) // Note: adds new auto to fn.Dcl list
+       nn.SetNonMergeable(true)
+       return nn
+}
+
+// IsMergeCandidate returns true if variable n could participate in
+// stack slot merging. For now we're restricting the set to items
+// larger than what CanSSA would allow (approximately); we also
+// disallow things marked as open defer slots so as to avoid
+// complicating liveness analysis. Merging must be enabled
+// (base.Debug.MergeLocals != 0) and base.Flag.N must be zero, and n
+// must be a PAUTO variable that is neither address-taken nor marked
+// non-mergeable.
+func IsMergeCandidate(n *ir.Name) bool {
+       if base.Debug.MergeLocals == 0 ||
+               base.Flag.N != 0 ||
+               n.Class != ir.PAUTO ||
+               n.Type().Size() <= int64(3*types.PtrSize) ||
+               n.Addrtaken() ||
+               n.NonMergeable() ||
+               n.OpenDeferSlot() {
+               return false
+       }
+       return true
 }
index c3d9ec30919cb2ae33e2f0fd71a9501c44b13336..d0045e7ee3280214446dc568ca2387f77a779d71 100644 (file)
@@ -13,6 +13,7 @@ import (
 
        "cmd/compile/internal/base"
        "cmd/compile/internal/ir"
+       "cmd/compile/internal/liveness"
        "cmd/compile/internal/objw"
        "cmd/compile/internal/ssa"
        "cmd/compile/internal/types"
@@ -151,6 +152,18 @@ func (s *ssafn) AllocFrame(f *ssa.Func) {
                }
        }
 
+       // Compute the stack slot merging state, if merging is enabled.
+       // At any non-zero trace level, report the estimated savings; at
+       // trace levels above 1, also dump the merge state itself. (The
+       // outer check must be "> 0" rather than "== 1", otherwise the
+       // nested "> 1" dump can never be reached.)
+       var mls *liveness.MergeLocalsState
+       if base.Debug.MergeLocals != 0 {
+               mls = liveness.MergeLocals(fn, f)
+               if base.Debug.MergeLocalsTrace > 0 && mls != nil {
+                       fmt.Fprintf(os.Stderr, "%s: %d bytes of stack space saved via stack slot merging\n", ir.FuncName(fn), mls.EstSavings())
+                       if base.Debug.MergeLocalsTrace > 1 {
+                               fmt.Fprintf(os.Stderr, "=-= merge locals state for %v:\n%v",
+                                       fn, mls)
+                       }
+               }
+       }
+
        // Use sort.SliceStable instead of sort.Slice so stack layout (and thus
        // compiler output) is less sensitive to frontend changes that
        // introduce or remove unused variables.
@@ -158,6 +171,22 @@ func (s *ssafn) AllocFrame(f *ssa.Func) {
                return cmpstackvarlt(fn.Dcl[i], fn.Dcl[j])
        })
 
+       if base.Debug.MergeLocalsTrace > 1 && mls != nil {
+               fmt.Fprintf(os.Stderr, "=-= sorted DCL for %v:\n", fn)
+               for i, v := range fn.Dcl {
+                       if !ssa.IsMergeCandidate(v) {
+                               continue
+                       }
+                       fmt.Fprintf(os.Stderr, " %d: %q isleader=%v subsumed=%v used=%v\n", i, v.Sym().Name, mls.IsLeader(v), mls.Subsumed(v), v.Used())
+
+               }
+       }
+
+       var leaders map[*ir.Name]int64
+       if mls != nil {
+               leaders = make(map[*ir.Name]int64)
+       }
+
        // Reassign stack offsets of the locals that are used.
        lastHasPtr := false
        for i, n := range fn.Dcl {
@@ -165,12 +194,14 @@ func (s *ssafn) AllocFrame(f *ssa.Func) {
                        // i.e., stack assign if AUTO, or if PARAMOUT in registers (which has no predefined spill locations)
                        continue
                }
+               if mls != nil && mls.Subsumed(n) {
+                       continue
+               }
                if !n.Used() {
                        fn.DebugInfo.(*ssa.FuncDebug).OptDcl = fn.Dcl[i:]
                        fn.Dcl = fn.Dcl[:i]
                        break
                }
-
                types.CalcSize(n.Type())
                w := n.Type().Size()
                if w >= types.MaxWidth || w < 0 {
@@ -195,6 +226,42 @@ func (s *ssafn) AllocFrame(f *ssa.Func) {
                        lastHasPtr = false
                }
                n.SetFrameOffset(-s.stksize)
+               if mls != nil && mls.IsLeader(n) {
+                       leaders[n] = -s.stksize
+               }
+       }
+
+       if mls != nil {
+               followers := []*ir.Name{}
+               newdcl := make([]*ir.Name, 0, len(fn.Dcl))
+               for i := 0; i < len(fn.Dcl); i++ {
+                       n := fn.Dcl[i]
+                       if mls.Subsumed(n) {
+                               continue
+                       }
+                       newdcl = append(newdcl, n)
+                       if off, ok := leaders[n]; ok {
+                               followers = mls.Followers(n, followers)
+                               for _, f := range followers {
+                                       // Set the stack offset for each follower to be
+                                       // the same as the leader.
+                                       f.SetFrameOffset(off)
+                               }
+                               // position followers immediately after leader
+                               newdcl = append(newdcl, followers...)
+                       }
+               }
+               fn.Dcl = newdcl
+       }
+
+       if base.Debug.MergeLocalsTrace > 1 {
+               fmt.Fprintf(os.Stderr, "=-= stack layout for %v:\n", fn)
+               for i, v := range fn.Dcl {
+                       if v.Op() != ir.ONAME || (v.Class != ir.PAUTO && !(v.Class == ir.PPARAMOUT && v.IsOutputParamInRegisters())) {
+                               continue
+                       }
+                       fmt.Fprintf(os.Stderr, " %d: %q frameoff %d used=%v\n", i, v.Sym().Name, v.FrameOffset(), v.Used())
+               }
        }
 
        s.stksize = types.RoundUp(s.stksize, s.stkalign)
index 59b4c8808921f9fb06c50b216aa2fe25f1846e11..9e384fe01666ca701c87c9885f2b98c08203ab57 100644 (file)
@@ -633,7 +633,7 @@ func (s *state) zeroResults() {
                if typ := n.Type(); ssa.CanSSA(typ) {
                        s.assign(n, s.zeroVal(typ), false, 0)
                } else {
-                       if typ.HasPointers() {
+                       if typ.HasPointers() || ssa.IsMergeCandidate(n) {
                                s.vars[memVar] = s.newValue1A(ssa.OpVarDef, types.TypeMem, n, s.mem())
                        }
                        s.zero(n.Type(), s.decladdrs[n])
@@ -3949,7 +3949,7 @@ func (s *state) assignWhichMayOverlap(left ir.Node, right *ssa.Value, deref bool
 
        // If this assignment clobbers an entire local variable, then emit
        // OpVarDef so liveness analysis knows the variable is redefined.
-       if base, ok := clobberBase(left).(*ir.Name); ok && base.OnStack() && skip == 0 && t.HasPointers() {
+       if base, ok := clobberBase(left).(*ir.Name); ok && base.OnStack() && skip == 0 && (t.HasPointers() || ssa.IsMergeCandidate(base)) {
                s.vars[memVar] = s.newValue1Apos(ssa.OpVarDef, types.TypeMem, base, s.mem(), !ir.IsAutoTmp(base))
        }
 
@@ -5389,7 +5389,8 @@ func (s *state) call(n *ir.CallExpr, k callKind, returnResultAddr bool, deferExt
                }
                // Make a defer struct on the stack.
                t := deferstruct()
-               _, addr := s.temp(n.Pos(), t)
+               n, addr := s.temp(n.Pos(), t)
+               n.SetNonMergeable(true)
                s.store(closure.Type,
                        s.newValue1I(ssa.OpOffPtr, closure.Type.PtrTo(), t.FieldOff(deferStructFnField), addr),
                        closure)
@@ -6893,7 +6894,7 @@ func (s *state) dottype1(pos src.XPos, src, dst *types.Type, iface, source, targ
 // temp allocates a temp of type t at position pos
 func (s *state) temp(pos src.XPos, t *types.Type) (*ir.Name, *ssa.Value) {
        tmp := typecheck.TempAt(pos, s.curfn, t)
-       if t.HasPointers() {
+       if t.HasPointers() || (ssa.IsMergeCandidate(tmp) && t != deferstruct()) {
                s.vars[memVar] = s.newValue1A(ssa.OpVarDef, types.TypeMem, tmp, s.mem())
        }
        addr := s.addr(tmp)
diff --git a/src/cmd/compile/internal/test/mergelocals_test.go b/src/cmd/compile/internal/test/mergelocals_test.go
new file mode 100644 (file)
index 0000000..f070197
--- /dev/null
@@ -0,0 +1,184 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package test
+
+import (
+       "cmd/compile/internal/ir"
+       "cmd/compile/internal/liveness"
+       "cmd/compile/internal/typecheck"
+       "cmd/compile/internal/types"
+       "cmd/internal/src"
+       "internal/testenv"
+       "path/filepath"
+       "slices"
+       "sort"
+       "strings"
+       "testing"
+)
+
+// TestMergeLocalState hands a series of hand-built partitions to
+// liveness.MakeMergeLocalsState and checks, case by case, whether an
+// error is or is not reported as expected.
+func TestMergeLocalState(t *testing.T) {
+       newInt32Var := func(name string) *ir.Name {
+               return ir.NewNameAt(src.NoXPos, typecheck.Lookup(name), types.Types[types.TINT32])
+       }
+       v1, v2, v3 := newInt32Var("v1"), newInt32Var("v2"), newInt32Var("v3")
+
+       type scenario struct {
+               vars      []*ir.Name
+               partition map[*ir.Name][]int
+               wantErr   bool
+       }
+       scenarios := []scenario{
+               // well-formed: all three vars agree on one partition
+               {
+                       vars: []*ir.Name{v1, v2, v3},
+                       partition: map[*ir.Name][]int{
+                               v1: {0, 1, 2},
+                               v2: {0, 1, 2},
+                               v3: {0, 1, 2},
+                       },
+                       wantErr: false,
+               },
+               // invalid slot index -1 in the partition for v1
+               {
+                       vars: []*ir.Name{v1, v2, v3},
+                       partition: map[*ir.Name][]int{
+                               v1: {-1, 0},
+                               v2: {0, 1, 2},
+                               v3: {0, 1, 2},
+                       },
+                       wantErr: true,
+               },
+               // the same variable appears twice in vars
+               {
+                       vars: []*ir.Name{v1, v2, v2},
+                       partition: map[*ir.Name][]int{
+                               v1: {0, 1, 2},
+                               v2: {0, 1, 2},
+                               v3: {0, 1, 2},
+                       },
+                       wantErr: true,
+               },
+               // a partition holding only a single element
+               {
+                       vars: []*ir.Name{v1, v2, v3},
+                       partition: map[*ir.Name][]int{
+                               v1: {0},
+                               v2: {0, 1, 2},
+                               v3: {0, 1, 2},
+                       },
+                       wantErr: true,
+               },
+               // element 2 is absent from every partition
+               {
+                       vars: []*ir.Name{v1, v2, v3},
+                       partition: map[*ir.Name][]int{
+                               v1: {0, 1},
+                               v2: {0, 1},
+                               v3: {0, 1},
+                       },
+                       wantErr: true,
+               },
+               // v1 and v2 disagree about their shared partition
+               {
+                       vars: []*ir.Name{v1, v2, v3},
+                       partition: map[*ir.Name][]int{
+                               v1: {0, 1, 2},
+                               v2: {1, 0, 2},
+                               v3: {0, 1, 2},
+                       },
+                       wantErr: true,
+               },
+       }
+
+       for idx, sc := range scenarios {
+               state, err := liveness.MakeMergeLocalsState(sc.partition, sc.vars)
+               t.Logf("tc %d err is %v\n", idx, err)
+               switch {
+               case sc.wantErr && err == nil:
+                       t.Fatalf("tc:%d missing error mls %v", idx, state)
+               case !sc.wantErr && err != nil:
+                       t.Fatalf("tc:%d unexpected error mls %v", idx, err)
+               }
+               if state != nil {
+                       t.Logf("tc %d: mls: %v\n", idx, state.String())
+               }
+       }
+}
+
+func TestMergeLocalsIntegration(t *testing.T) {
+       testenv.MustHaveGoBuild(t)
+
+       // This test does a build of a specific canned package to
+       // check whether merging of stack slots is taking place.
+       // The idea is to do the compile with a trace option turned
+       // on and then pick up on the frame offsets of specific
+       // variables.
+       //
+       // Stack slot merging is a greedy algorithm, and there can
+       // be many possible ways to overlap a given set of candidate
+       // variables, all of them legal. Rather than locking down
+       // a specific set of overlappings or frame offsets, this
+       // tests just verifies that there is one clump of 3 vars that
+       // get overlapped, then another clump of 2 that share the same
+       // frame offset.
+       //
+       // The expected output blob we're interested in looks like this:
+       //
+       // =-= stack layout for ABC:
+       //  2: "p1" frameoff -8200 used=true
+       //  3: "xp3" frameoff -8200 used=true
+       //  4: "xp4" frameoff -8200 used=true
+       //  5: "p2" frameoff -16400 used=true
+       //  6: "s" frameoff -24592 used=true
+       //  7: "v1" frameoff -32792 used=true
+       //  8: "v3" frameoff -32792 used=true
+       //  9: "v2" frameoff -40992 used=true
+       //
+       tmpdir := t.TempDir()
+       src := filepath.Join("testdata", "mergelocals", "integration.go")
+       obj := filepath.Join(tmpdir, "p.a")
+       out, err := testenv.Command(t, testenv.GoToolPath(t), "tool", "compile", "-p=p", "-c", "1", "-o", obj, "-d=mergelocalstrace=2,mergelocals=1", src).CombinedOutput()
+       if err != nil {
+               t.Fatalf("failed to compile: %v\n%s", err, out)
+       }
+       vars := make(map[string]string)
+       lines := strings.Split(string(out), "\n")
+       prolog := true
+       varsAtFrameOffset := make(map[string]int)
+       for _, line := range lines {
+               if line == "=-= stack layout for ABC:" {
+                       prolog = false
+                       continue
+               } else if prolog || line == "" {
+                       continue
+               }
+               fields := strings.Fields(line)
+               if len(fields) != 5 {
+                       t.Fatalf("bad trace output line: %s", line)
+               }
+               vname := fields[1]
+               frameoff := fields[3]
+               varsAtFrameOffset[frameoff] = varsAtFrameOffset[frameoff] + 1
+               vars[vname] = frameoff
+       }
+       wantvnum := 8
+       gotvnum := len(vars)
+       if wantvnum != gotvnum {
+               t.Fatalf("expected trace output on %d vars got %d\n", wantvnum, gotvnum)
+       }
+
+       // We expect one clump of 3, another clump of 2, and the rest singletons.
+       expected := []int{1, 1, 1, 2, 3}
+       got := []int{}
+       for _, v := range varsAtFrameOffset {
+               got = append(got, v)
+       }
+       sort.Ints(got)
+       if !slices.Equal(got, expected) {
+               t.Fatalf("expected variable clumps %+v not equal to what we got: %+v", expected, got)
+       }
+}
diff --git a/src/cmd/compile/internal/test/testdata/mergelocals/integration.go b/src/cmd/compile/internal/test/testdata/mergelocals/integration.go
new file mode 100644 (file)
index 0000000..d640c6f
--- /dev/null
@@ -0,0 +1,83 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package p
+
+// Pointery and the following type, Pointery2, share the same size and
+// GC shape: one leading pointer field followed by an array of 1024 ints.
+type Pointery struct {
+       p *Pointery
+       x [1024]int
+}
+
+// Pointery2 has the same field layout as Pointery; only the element
+// type of its pointer field differs.
+type Pointery2 struct {
+       p *Pointery2
+       x [1024]int
+}
+
+// Vanilla and the following type, Vanilla2, contain no pointers and
+// have the same size: a uintptr plus 1024 ints in total.
+type Vanilla struct {
+       np uintptr
+       x  [1024]int
+}
+
+// Vanilla2 is the same size as Vanilla but is a distinct type with
+// different fields: the last int lives in its own field y rather than
+// in the array.
+type Vanilla2 struct {
+       np uintptr
+       x  [1023]int
+       y  int
+}
+
+// Single contains no pointers and is one int smaller than Vanilla and
+// Vanilla2; no other type in this file has the same size.
+type Single struct {
+       np uintptr
+       x  [1023]int
+}
+
+// ABC declares several large local variables with a mix of
+// overlapping and disjoint uses. It is the function whose
+// "stack layout for ABC" trace output TestMergeLocalsIntegration
+// inspects, so the order of declarations and uses below matters.
+func ABC(i, j int) int {
+       r := 0
+
+       // Here v1 interferes with v2 (v2 is declared before the uses
+       // of v1), but neither is used once v3 is declared, so either
+       // of them could be overlapped with v3.
+       var v1 Vanilla
+       if i < 101 {
+               var v2 Vanilla
+               v1.x[i] = j
+               r += v1.x[j]
+               v2.x[i] = j
+               r += v2.x[j]
+       }
+
+       {
+               var v3 Vanilla2
+               v3.x[i] = j
+               r += v3.x[j]
+       }
+
+       // s is the only variable of type Single in this function.
+       var s Single
+       s.x[i] = j
+       r += s.x[j]
+
+       // Here p1 and p2 interfere, but p1 could be overlapped with xp3.
+       var p1, p2 Pointery
+       p1.x[i] = j
+       r += p1.x[j]
+       p2.x[i] = j
+       r += p2.x[j]
+       {
+               var xp3 Pointery2
+               xp3.x[i] = j
+               r += xp3.x[j]
+       }
+
+       if i == j*2 {
+               // p2 live on this path
+               p2.x[i] += j
+               r += p2.x[j]
+       } else {
+               // p2 not live on this path
+               var xp4 Pointery2
+               xp4.x[i] = j
+               r += xp4.x[j]
+       }
+
+       return r
+}
index 886b5beec3e6712e24c3b6fbe3384db1b0f2d722..604ac173672cc945d9cf088a691232fdb552b6cb 100644 (file)
@@ -25,7 +25,9 @@ func initStackTemp(init *ir.Nodes, tmp *ir.Name, val ir.Node) *ir.AddrExpr {
 // allocated temporary variable of the given type. Statements to
 // zero-initialize tmp are appended to init.
 func stackTempAddr(init *ir.Nodes, typ *types.Type) *ir.AddrExpr {
-       return initStackTemp(init, typecheck.TempAt(base.Pos, ir.CurFunc, typ), nil)
+       n := typecheck.TempAt(base.Pos, ir.CurFunc, typ)
+       n.SetNonMergeable(true)
+       return initStackTemp(init, n, nil)
 }
 
 // stackBufAddr returns the expression &tmp, where tmp is a newly
index 3240960f1ad35a072a45dee8e3f09db8d3ab07af..deba9c9faecf9f165f41bfa9433f0bf319ea73da 100644 (file)
 
 package main
 
-var z [10<<20]byte
+var z [10 << 20]byte
 
 func main() { // GC_ERROR "stack frame too large"
-       // seq 1 206 | sed 's/.*/       var x& [10<<20]byte; z = x&/'
-       var x1 [10<<20]byte; z = x1
-       var x2 [10<<20]byte; z = x2
-       var x3 [10<<20]byte; z = x3
-       var x4 [10<<20]byte; z = x4
-       var x5 [10<<20]byte; z = x5
-       var x6 [10<<20]byte; z = x6
-       var x7 [10<<20]byte; z = x7
-       var x8 [10<<20]byte; z = x8
-       var x9 [10<<20]byte; z = x9
-       var x10 [10<<20]byte; z = x10
-       var x11 [10<<20]byte; z = x11
-       var x12 [10<<20]byte; z = x12
-       var x13 [10<<20]byte; z = x13
-       var x14 [10<<20]byte; z = x14
-       var x15 [10<<20]byte; z = x15
-       var x16 [10<<20]byte; z = x16
-       var x17 [10<<20]byte; z = x17
-       var x18 [10<<20]byte; z = x18
-       var x19 [10<<20]byte; z = x19
-       var x20 [10<<20]byte; z = x20
-       var x21 [10<<20]byte; z = x21
-       var x22 [10<<20]byte; z = x22
-       var x23 [10<<20]byte; z = x23
-       var x24 [10<<20]byte; z = x24
-       var x25 [10<<20]byte; z = x25
-       var x26 [10<<20]byte; z = x26
-       var x27 [10<<20]byte; z = x27
-       var x28 [10<<20]byte; z = x28
-       var x29 [10<<20]byte; z = x29
-       var x30 [10<<20]byte; z = x30
-       var x31 [10<<20]byte; z = x31
-       var x32 [10<<20]byte; z = x32
-       var x33 [10<<20]byte; z = x33
-       var x34 [10<<20]byte; z = x34
-       var x35 [10<<20]byte; z = x35
-       var x36 [10<<20]byte; z = x36
-       var x37 [10<<20]byte; z = x37
-       var x38 [10<<20]byte; z = x38
-       var x39 [10<<20]byte; z = x39
-       var x40 [10<<20]byte; z = x40
-       var x41 [10<<20]byte; z = x41
-       var x42 [10<<20]byte; z = x42
-       var x43 [10<<20]byte; z = x43
-       var x44 [10<<20]byte; z = x44
-       var x45 [10<<20]byte; z = x45
-       var x46 [10<<20]byte; z = x46
-       var x47 [10<<20]byte; z = x47
-       var x48 [10<<20]byte; z = x48
-       var x49 [10<<20]byte; z = x49
-       var x50 [10<<20]byte; z = x50
-       var x51 [10<<20]byte; z = x51
-       var x52 [10<<20]byte; z = x52
-       var x53 [10<<20]byte; z = x53
-       var x54 [10<<20]byte; z = x54
-       var x55 [10<<20]byte; z = x55
-       var x56 [10<<20]byte; z = x56
-       var x57 [10<<20]byte; z = x57
-       var x58 [10<<20]byte; z = x58
-       var x59 [10<<20]byte; z = x59
-       var x60 [10<<20]byte; z = x60
-       var x61 [10<<20]byte; z = x61
-       var x62 [10<<20]byte; z = x62
-       var x63 [10<<20]byte; z = x63
-       var x64 [10<<20]byte; z = x64
-       var x65 [10<<20]byte; z = x65
-       var x66 [10<<20]byte; z = x66
-       var x67 [10<<20]byte; z = x67
-       var x68 [10<<20]byte; z = x68
-       var x69 [10<<20]byte; z = x69
-       var x70 [10<<20]byte; z = x70
-       var x71 [10<<20]byte; z = x71
-       var x72 [10<<20]byte; z = x72
-       var x73 [10<<20]byte; z = x73
-       var x74 [10<<20]byte; z = x74
-       var x75 [10<<20]byte; z = x75
-       var x76 [10<<20]byte; z = x76
-       var x77 [10<<20]byte; z = x77
-       var x78 [10<<20]byte; z = x78
-       var x79 [10<<20]byte; z = x79
-       var x80 [10<<20]byte; z = x80
-       var x81 [10<<20]byte; z = x81
-       var x82 [10<<20]byte; z = x82
-       var x83 [10<<20]byte; z = x83
-       var x84 [10<<20]byte; z = x84
-       var x85 [10<<20]byte; z = x85
-       var x86 [10<<20]byte; z = x86
-       var x87 [10<<20]byte; z = x87
-       var x88 [10<<20]byte; z = x88
-       var x89 [10<<20]byte; z = x89
-       var x90 [10<<20]byte; z = x90
-       var x91 [10<<20]byte; z = x91
-       var x92 [10<<20]byte; z = x92
-       var x93 [10<<20]byte; z = x93
-       var x94 [10<<20]byte; z = x94
-       var x95 [10<<20]byte; z = x95
-       var x96 [10<<20]byte; z = x96
-       var x97 [10<<20]byte; z = x97
-       var x98 [10<<20]byte; z = x98
-       var x99 [10<<20]byte; z = x99
-       var x100 [10<<20]byte; z = x100
-       var x101 [10<<20]byte; z = x101
-       var x102 [10<<20]byte; z = x102
-       var x103 [10<<20]byte; z = x103
-       var x104 [10<<20]byte; z = x104
-       var x105 [10<<20]byte; z = x105
-       var x106 [10<<20]byte; z = x106
-       var x107 [10<<20]byte; z = x107
-       var x108 [10<<20]byte; z = x108
-       var x109 [10<<20]byte; z = x109
-       var x110 [10<<20]byte; z = x110
-       var x111 [10<<20]byte; z = x111
-       var x112 [10<<20]byte; z = x112
-       var x113 [10<<20]byte; z = x113
-       var x114 [10<<20]byte; z = x114
-       var x115 [10<<20]byte; z = x115
-       var x116 [10<<20]byte; z = x116
-       var x117 [10<<20]byte; z = x117
-       var x118 [10<<20]byte; z = x118
-       var x119 [10<<20]byte; z = x119
-       var x120 [10<<20]byte; z = x120
-       var x121 [10<<20]byte; z = x121
-       var x122 [10<<20]byte; z = x122
-       var x123 [10<<20]byte; z = x123
-       var x124 [10<<20]byte; z = x124
-       var x125 [10<<20]byte; z = x125
-       var x126 [10<<20]byte; z = x126
-       var x127 [10<<20]byte; z = x127
-       var x128 [10<<20]byte; z = x128
-       var x129 [10<<20]byte; z = x129
-       var x130 [10<<20]byte; z = x130
-       var x131 [10<<20]byte; z = x131
-       var x132 [10<<20]byte; z = x132
-       var x133 [10<<20]byte; z = x133
-       var x134 [10<<20]byte; z = x134
-       var x135 [10<<20]byte; z = x135
-       var x136 [10<<20]byte; z = x136
-       var x137 [10<<20]byte; z = x137
-       var x138 [10<<20]byte; z = x138
-       var x139 [10<<20]byte; z = x139
-       var x140 [10<<20]byte; z = x140
-       var x141 [10<<20]byte; z = x141
-       var x142 [10<<20]byte; z = x142
-       var x143 [10<<20]byte; z = x143
-       var x144 [10<<20]byte; z = x144
-       var x145 [10<<20]byte; z = x145
-       var x146 [10<<20]byte; z = x146
-       var x147 [10<<20]byte; z = x147
-       var x148 [10<<20]byte; z = x148
-       var x149 [10<<20]byte; z = x149
-       var x150 [10<<20]byte; z = x150
-       var x151 [10<<20]byte; z = x151
-       var x152 [10<<20]byte; z = x152
-       var x153 [10<<20]byte; z = x153
-       var x154 [10<<20]byte; z = x154
-       var x155 [10<<20]byte; z = x155
-       var x156 [10<<20]byte; z = x156
-       var x157 [10<<20]byte; z = x157
-       var x158 [10<<20]byte; z = x158
-       var x159 [10<<20]byte; z = x159
-       var x160 [10<<20]byte; z = x160
-       var x161 [10<<20]byte; z = x161
-       var x162 [10<<20]byte; z = x162
-       var x163 [10<<20]byte; z = x163
-       var x164 [10<<20]byte; z = x164
-       var x165 [10<<20]byte; z = x165
-       var x166 [10<<20]byte; z = x166
-       var x167 [10<<20]byte; z = x167
-       var x168 [10<<20]byte; z = x168
-       var x169 [10<<20]byte; z = x169
-       var x170 [10<<20]byte; z = x170
-       var x171 [10<<20]byte; z = x171
-       var x172 [10<<20]byte; z = x172
-       var x173 [10<<20]byte; z = x173
-       var x174 [10<<20]byte; z = x174
-       var x175 [10<<20]byte; z = x175
-       var x176 [10<<20]byte; z = x176
-       var x177 [10<<20]byte; z = x177
-       var x178 [10<<20]byte; z = x178
-       var x179 [10<<20]byte; z = x179
-       var x180 [10<<20]byte; z = x180
-       var x181 [10<<20]byte; z = x181
-       var x182 [10<<20]byte; z = x182
-       var x183 [10<<20]byte; z = x183
-       var x184 [10<<20]byte; z = x184
-       var x185 [10<<20]byte; z = x185
-       var x186 [10<<20]byte; z = x186
-       var x187 [10<<20]byte; z = x187
-       var x188 [10<<20]byte; z = x188
-       var x189 [10<<20]byte; z = x189
-       var x190 [10<<20]byte; z = x190
-       var x191 [10<<20]byte; z = x191
-       var x192 [10<<20]byte; z = x192
-       var x193 [10<<20]byte; z = x193
-       var x194 [10<<20]byte; z = x194
-       var x195 [10<<20]byte; z = x195
-       var x196 [10<<20]byte; z = x196
-       var x197 [10<<20]byte; z = x197
-       var x198 [10<<20]byte; z = x198
-       var x199 [10<<20]byte; z = x199
-       var x200 [10<<20]byte; z = x200
-       var x201 [10<<20]byte; z = x201
-       var x202 [10<<20]byte; z = x202
-       var x203 [10<<20]byte; z = x203
-       var x204 [10<<20]byte; z = x204
-       var x205 [10<<20]byte; z = x205
-       var x206 [10<<20]byte; z = x206
+       // seq 1 207 | sed 's/.*/       var x& [10<<20]byte/'
+       // seq 1 207 | sed 's/.*/       z = x&/'
+       var x1 [10<<20]byte
+       var x2 [10<<20]byte
+       var x3 [10<<20]byte
+       var x4 [10<<20]byte
+       var x5 [10<<20]byte
+       var x6 [10<<20]byte
+       var x7 [10<<20]byte
+       var x8 [10<<20]byte
+       var x9 [10<<20]byte
+       var x10 [10<<20]byte
+       var x11 [10<<20]byte
+       var x12 [10<<20]byte
+       var x13 [10<<20]byte
+       var x14 [10<<20]byte
+       var x15 [10<<20]byte
+       var x16 [10<<20]byte
+       var x17 [10<<20]byte
+       var x18 [10<<20]byte
+       var x19 [10<<20]byte
+       var x20 [10<<20]byte
+       var x21 [10<<20]byte
+       var x22 [10<<20]byte
+       var x23 [10<<20]byte
+       var x24 [10<<20]byte
+       var x25 [10<<20]byte
+       var x26 [10<<20]byte
+       var x27 [10<<20]byte
+       var x28 [10<<20]byte
+       var x29 [10<<20]byte
+       var x30 [10<<20]byte
+       var x31 [10<<20]byte
+       var x32 [10<<20]byte
+       var x33 [10<<20]byte
+       var x34 [10<<20]byte
+       var x35 [10<<20]byte
+       var x36 [10<<20]byte
+       var x37 [10<<20]byte
+       var x38 [10<<20]byte
+       var x39 [10<<20]byte
+       var x40 [10<<20]byte
+       var x41 [10<<20]byte
+       var x42 [10<<20]byte
+       var x43 [10<<20]byte
+       var x44 [10<<20]byte
+       var x45 [10<<20]byte
+       var x46 [10<<20]byte
+       var x47 [10<<20]byte
+       var x48 [10<<20]byte
+       var x49 [10<<20]byte
+       var x50 [10<<20]byte
+       var x51 [10<<20]byte
+       var x52 [10<<20]byte
+       var x53 [10<<20]byte
+       var x54 [10<<20]byte
+       var x55 [10<<20]byte
+       var x56 [10<<20]byte
+       var x57 [10<<20]byte
+       var x58 [10<<20]byte
+       var x59 [10<<20]byte
+       var x60 [10<<20]byte
+       var x61 [10<<20]byte
+       var x62 [10<<20]byte
+       var x63 [10<<20]byte
+       var x64 [10<<20]byte
+       var x65 [10<<20]byte
+       var x66 [10<<20]byte
+       var x67 [10<<20]byte
+       var x68 [10<<20]byte
+       var x69 [10<<20]byte
+       var x70 [10<<20]byte
+       var x71 [10<<20]byte
+       var x72 [10<<20]byte
+       var x73 [10<<20]byte
+       var x74 [10<<20]byte
+       var x75 [10<<20]byte
+       var x76 [10<<20]byte
+       var x77 [10<<20]byte
+       var x78 [10<<20]byte
+       var x79 [10<<20]byte
+       var x80 [10<<20]byte
+       var x81 [10<<20]byte
+       var x82 [10<<20]byte
+       var x83 [10<<20]byte
+       var x84 [10<<20]byte
+       var x85 [10<<20]byte
+       var x86 [10<<20]byte
+       var x87 [10<<20]byte
+       var x88 [10<<20]byte
+       var x89 [10<<20]byte
+       var x90 [10<<20]byte
+       var x91 [10<<20]byte
+       var x92 [10<<20]byte
+       var x93 [10<<20]byte
+       var x94 [10<<20]byte
+       var x95 [10<<20]byte
+       var x96 [10<<20]byte
+       var x97 [10<<20]byte
+       var x98 [10<<20]byte
+       var x99 [10<<20]byte
+       var x100 [10<<20]byte
+       var x101 [10<<20]byte
+       var x102 [10<<20]byte
+       var x103 [10<<20]byte
+       var x104 [10<<20]byte
+       var x105 [10<<20]byte
+       var x106 [10<<20]byte
+       var x107 [10<<20]byte
+       var x108 [10<<20]byte
+       var x109 [10<<20]byte
+       var x110 [10<<20]byte
+       var x111 [10<<20]byte
+       var x112 [10<<20]byte
+       var x113 [10<<20]byte
+       var x114 [10<<20]byte
+       var x115 [10<<20]byte
+       var x116 [10<<20]byte
+       var x117 [10<<20]byte
+       var x118 [10<<20]byte
+       var x119 [10<<20]byte
+       var x120 [10<<20]byte
+       var x121 [10<<20]byte
+       var x122 [10<<20]byte
+       var x123 [10<<20]byte
+       var x124 [10<<20]byte
+       var x125 [10<<20]byte
+       var x126 [10<<20]byte
+       var x127 [10<<20]byte
+       var x128 [10<<20]byte
+       var x129 [10<<20]byte
+       var x130 [10<<20]byte
+       var x131 [10<<20]byte
+       var x132 [10<<20]byte
+       var x133 [10<<20]byte
+       var x134 [10<<20]byte
+       var x135 [10<<20]byte
+       var x136 [10<<20]byte
+       var x137 [10<<20]byte
+       var x138 [10<<20]byte
+       var x139 [10<<20]byte
+       var x140 [10<<20]byte
+       var x141 [10<<20]byte
+       var x142 [10<<20]byte
+       var x143 [10<<20]byte
+       var x144 [10<<20]byte
+       var x145 [10<<20]byte
+       var x146 [10<<20]byte
+       var x147 [10<<20]byte
+       var x148 [10<<20]byte
+       var x149 [10<<20]byte
+       var x150 [10<<20]byte
+       var x151 [10<<20]byte
+       var x152 [10<<20]byte
+       var x153 [10<<20]byte
+       var x154 [10<<20]byte
+       var x155 [10<<20]byte
+       var x156 [10<<20]byte
+       var x157 [10<<20]byte
+       var x158 [10<<20]byte
+       var x159 [10<<20]byte
+       var x160 [10<<20]byte
+       var x161 [10<<20]byte
+       var x162 [10<<20]byte
+       var x163 [10<<20]byte
+       var x164 [10<<20]byte
+       var x165 [10<<20]byte
+       var x166 [10<<20]byte
+       var x167 [10<<20]byte
+       var x168 [10<<20]byte
+       var x169 [10<<20]byte
+       var x170 [10<<20]byte
+       var x171 [10<<20]byte
+       var x172 [10<<20]byte
+       var x173 [10<<20]byte
+       var x174 [10<<20]byte
+       var x175 [10<<20]byte
+       var x176 [10<<20]byte
+       var x177 [10<<20]byte
+       var x178 [10<<20]byte
+       var x179 [10<<20]byte
+       var x180 [10<<20]byte
+       var x181 [10<<20]byte
+       var x182 [10<<20]byte
+       var x183 [10<<20]byte
+       var x184 [10<<20]byte
+       var x185 [10<<20]byte
+       var x186 [10<<20]byte
+       var x187 [10<<20]byte
+       var x188 [10<<20]byte
+       var x189 [10<<20]byte
+       var x190 [10<<20]byte
+       var x191 [10<<20]byte
+       var x192 [10<<20]byte
+       var x193 [10<<20]byte
+       var x194 [10<<20]byte
+       var x195 [10<<20]byte
+       var x196 [10<<20]byte
+       var x197 [10<<20]byte
+       var x198 [10<<20]byte
+       var x199 [10<<20]byte
+       var x200 [10<<20]byte
+       var x201 [10<<20]byte
+       var x202 [10<<20]byte
+       var x203 [10<<20]byte
+       var x204 [10<<20]byte
+       var x205 [10<<20]byte
+       var x206 [10<<20]byte
+       var x207 [10<<20]byte
+       z = x1
+       z = x2
+       z = x3
+       z = x4
+       z = x5
+       z = x6
+       z = x7
+       z = x8
+       z = x9
+       z = x10
+       z = x11
+       z = x12
+       z = x13
+       z = x14
+       z = x15
+       z = x16
+       z = x17
+       z = x18
+       z = x19
+       z = x20
+       z = x21
+       z = x22
+       z = x23
+       z = x24
+       z = x25
+       z = x26
+       z = x27
+       z = x28
+       z = x29
+       z = x30
+       z = x31
+       z = x32
+       z = x33
+       z = x34
+       z = x35
+       z = x36
+       z = x37
+       z = x38
+       z = x39
+       z = x40
+       z = x41
+       z = x42
+       z = x43
+       z = x44
+       z = x45
+       z = x46
+       z = x47
+       z = x48
+       z = x49
+       z = x50
+       z = x51
+       z = x52
+       z = x53
+       z = x54
+       z = x55
+       z = x56
+       z = x57
+       z = x58
+       z = x59
+       z = x60
+       z = x61
+       z = x62
+       z = x63
+       z = x64
+       z = x65
+       z = x66
+       z = x67
+       z = x68
+       z = x69
+       z = x70
+       z = x71
+       z = x72
+       z = x73
+       z = x74
+       z = x75
+       z = x76
+       z = x77
+       z = x78
+       z = x79
+       z = x80
+       z = x81
+       z = x82
+       z = x83
+       z = x84
+       z = x85
+       z = x86
+       z = x87
+       z = x88
+       z = x89
+       z = x90
+       z = x91
+       z = x92
+       z = x93
+       z = x94
+       z = x95
+       z = x96
+       z = x97
+       z = x98
+       z = x99
+       z = x100
+       z = x101
+       z = x102
+       z = x103
+       z = x104
+       z = x105
+       z = x106
+       z = x107
+       z = x108
+       z = x109
+       z = x110
+       z = x111
+       z = x112
+       z = x113
+       z = x114
+       z = x115
+       z = x116
+       z = x117
+       z = x118
+       z = x119
+       z = x120
+       z = x121
+       z = x122
+       z = x123
+       z = x124
+       z = x125
+       z = x126
+       z = x127
+       z = x128
+       z = x129
+       z = x130
+       z = x131
+       z = x132
+       z = x133
+       z = x134
+       z = x135
+       z = x136
+       z = x137
+       z = x138
+       z = x139
+       z = x140
+       z = x141
+       z = x142
+       z = x143
+       z = x144
+       z = x145
+       z = x146
+       z = x147
+       z = x148
+       z = x149
+       z = x150
+       z = x151
+       z = x152
+       z = x153
+       z = x154
+       z = x155
+       z = x156
+       z = x157
+       z = x158
+       z = x159
+       z = x160
+       z = x161
+       z = x162
+       z = x163
+       z = x164
+       z = x165
+       z = x166
+       z = x167
+       z = x168
+       z = x169
+       z = x170
+       z = x171
+       z = x172
+       z = x173
+       z = x174
+       z = x175
+       z = x176
+       z = x177
+       z = x178
+       z = x179
+       z = x180
+       z = x181
+       z = x182
+       z = x183
+       z = x184
+       z = x185
+       z = x186
+       z = x187
+       z = x188
+       z = x189
+       z = x190
+       z = x191
+       z = x192
+       z = x193
+       z = x194
+       z = x195
+       z = x196
+       z = x197
+       z = x198
+       z = x199
+       z = x200
+       z = x201
+       z = x202
+       z = x203
+       z = x204
+       z = x205
+       z = x206
+       z = x207
 }