--- /dev/null
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ssa
+
+import (
+ "cmd/compile/internal/ir"
+ "cmd/compile/internal/types"
+ "slices"
+)
+
+// The pair pass finds memory operations that can be paired up
+// into single 2-register memory instructions.
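+//
+// For example (an illustrative sketch; actual registers depend on
+// the register allocator), two adjacent stores
+//
+// MOVD R1, 8(R0)
+// MOVD R2, 16(R0)
+//
+// can become a single store-pair instruction:
+//
+// STP (R1, R2), 8(R0)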
+func pair(f *Func) {
+ // Only arm64 for now. This pass is fairly arch-specific.
+ switch f.Config.arch {
+ case "arm64":
+ default:
+ return
+ }
+ pairLoads(f)
+ pairStores(f)
+}
+
+type pairableLoadInfo struct {
+ width int64 // width of one element in the pair, in bytes
+ pair Op
+}
+
+// All pairableLoads keys must take 2 arguments, a pointer and a memory.
+// They must also take an offset in Aux/AuxInt. The paired ops produce
+// a tuple of the two loaded values.
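+//
+// For example (a sketch in SSA rewrite-rule notation), pairLoads turns
+//
+// x = (MOVDload [8] p mem)
+// y = (MOVDload [16] p mem)
+//
+// into
+//
+// l = (LDP [8] p mem)
+// x = (Select0 l)
+// y = (Select1 l)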
+var pairableLoads = map[Op]pairableLoadInfo{
+ OpARM64MOVDload: {8, OpARM64LDP},
+ OpARM64MOVWload: {4, OpARM64LDPW},
+ OpARM64FMOVDload: {8, OpARM64FLDPD},
+ OpARM64FMOVSload: {4, OpARM64FLDPS},
+}
+
+type pairableStoreInfo struct {
+ width int64 // width of one element in the pair, in bytes
+ pair Op
+}
+
+// All pairableStores keys must take 3 arguments, a pointer, a value, and a memory.
+// All pairableStores values must take 4 arguments, a pointer, 2 values, and a memory.
+// Both must also take an offset in Aux/AuxInt.
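+//
+// For example (again in SSA rewrite-rule notation), pairStores turns
+//
+// (MOVDstore [16] p y (MOVDstore [8] p x mem))
+//
+// into
+//
+// (STP [8] p x y mem)
+//
+// with the inner store reduced to a copy of mem.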
+var pairableStores = map[Op]pairableStoreInfo{
+ OpARM64MOVDstore: {8, OpARM64STP},
+ OpARM64MOVWstore: {4, OpARM64STPW},
+ OpARM64FMOVDstore: {8, OpARM64FSTPD},
+ OpARM64FMOVSstore: {4, OpARM64FSTPS},
+ // TODO: storezero variants.
+}
+
+// offsetOk returns true if a pair instruction should be used
+// for the offset Aux+off, when the data width (of the
+// unpaired instructions) is width.
+// This function is best-effort. The compiled function must
+// still work if offsetOk always returns true.
+// TODO: this is currently arm64-specific.
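+// For example, the range checks below would accept offsetOk(nil, 504, 8)
+// (504 is a multiple of 8 within [-512, 504]) and reject
+// offsetOk(nil, 505, 8), though the early "return true" currently
+// bypasses them.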
+func offsetOk(aux Aux, off, width int64) bool {
+ if true {
+ // Seems to generate slightly smaller code if we just
+ // always allow this rewrite.
+ //
+ // Without pairing, we have 2 load instructions, like:
+ // LDR 88(R0), R1
+ // LDR 96(R0), R2
+ // with pairing we have, best case:
+ // LDP 88(R0), R1, R2
+ // but maybe we need an adjuster if out of range or unaligned:
+ // ADD R0, $88, R27
+ // LDP (R27), R1, R2
+ // Even with the adjuster, it is at least no worse.
+ //
+ // A similar situation occurs when accessing globals.
+ // Two loads from globals requires 4 instructions,
+ // two ADRP and two LDR. With pairing, we need
+ // ADRP+ADD+LDP, three instructions.
+ //
+ // With pairing, it looks like the critical path might
+ // be a little bit longer. But it should never be more
+ // instructions.
+ // TODO: see if that longer critical path causes any
+ // regressions.
+ return true
+ }
+ if aux != nil {
+ if _, ok := aux.(*ir.Name); !ok {
+ // Offset is probably too big (globals).
+ return false
+ }
+ // We let *ir.Names pass here, as
+ // they are probably small offsets from SP.
+ // There's no guarantee that we're in range
+ // in that case though (we don't know the
+ // stack frame size yet), so the assembler
+ // might need to issue fixup instructions.
+ // Assume some small frame size.
+ if off >= 0 {
+ off += 120
+ }
+ // TODO: figure out how often this helps vs. hurts.
+ }
+ switch width {
+ case 4:
+ if off >= -256 && off <= 252 && off%4 == 0 {
+ return true
+ }
+ case 8:
+ if off >= -512 && off <= 504 && off%8 == 0 {
+ return true
+ }
+ }
+ return false
+}
+
+func pairLoads(f *Func) {
+ var loads []*Value
+
+ // Registry of aux values for sorting.
+ auxIDs := map[Aux]int{}
+ auxID := func(aux Aux) int {
+ id, ok := auxIDs[aux]
+ if !ok {
+ id = len(auxIDs)
+ auxIDs[aux] = id
+ }
+ return id
+ }
+
+ for _, b := range f.Blocks {
+ // Find loads.
+ loads = loads[:0]
+ clear(auxIDs)
+ for _, v := range b.Values {
+ info := pairableLoads[v.Op]
+ if info.width == 0 {
+ continue // not pairable
+ }
+ if !offsetOk(v.Aux, v.AuxInt, info.width) {
+ continue // not advisable
+ }
+ loads = append(loads, v)
+ }
+ if len(loads) < 2 {
+ continue
+ }
+
+ // Sort to put pairable loads together.
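+ // After sorting, loads that can pair with each other end up
+ // next to each other, so the linear scan below finds every pair.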
+ slices.SortFunc(loads, func(x, y *Value) int {
+ // First sort by op, ptr, and memory arg.
+ if x.Op != y.Op {
+ return int(x.Op - y.Op)
+ }
+ if x.Args[0].ID != y.Args[0].ID {
+ return int(x.Args[0].ID - y.Args[0].ID)
+ }
+ if x.Args[1].ID != y.Args[1].ID {
+ return int(x.Args[1].ID - y.Args[1].ID)
+ }
+ // Then sort by aux. (nil first, then by aux ID)
+ if x.Aux != nil {
+ if y.Aux == nil {
+ return 1
+ }
+ a, b := auxID(x.Aux), auxID(y.Aux)
+ if a != b {
+ return a - b
+ }
+ } else if y.Aux != nil {
+ return -1
+ }
+ // Then sort by offset, low to high.
+ return int(x.AuxInt - y.AuxInt)
+ })
+
+ // Look for pairable loads.
+ for i := 0; i < len(loads)-1; i++ {
+ x := loads[i]
+ y := loads[i+1]
+ if x.Op != y.Op || x.Args[0] != y.Args[0] || x.Args[1] != y.Args[1] {
+ continue // different op, pointer, or memory
+ }
+ if x.Aux != y.Aux {
+ continue // different symbol
+ }
+ if x.AuxInt+pairableLoads[x.Op].width != y.AuxInt {
+ continue // offsets not adjacent
+ }
+
+ // Commit point.
+
+ // Make the 2-register load.
+ load := b.NewValue2IA(x.Pos, pairableLoads[x.Op].pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])
+
+ // Modify x to be (Select0 load). Similar for y.
+ x.reset(OpSelect0)
+ x.SetArgs1(load)
+ y.reset(OpSelect1)
+ y.SetArgs1(load)
+
+ i++ // Skip y next time around the loop.
+ }
+ }
+}
+
+func pairStores(f *Func) {
+ last := f.Cache.allocBoolSlice(f.NumValues())
+ defer f.Cache.freeBoolSlice(last)
+
+ // prevStore returns the previous store in the
+ // same block, or nil if there are none.
+ prevStore := func(v *Value) *Value {
+ if v.Op == OpInitMem || v.Op == OpPhi {
+ return nil
+ }
+ m := v.MemoryArg()
+ if m.Block != v.Block {
+ return nil
+ }
+ return m
+ }
+
+ for _, b := range f.Blocks {
+ // Find last store in block, so we can
+ // walk the stores last to first.
+ // Last to first helps ensure that the rewrites we
+ // perform do not get in the way of subsequent rewrites.
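+ // To find it, mark every memory-producing value in the block,
+ // then unmark each one that a later value in the block uses as
+ // its memory arg; the value left marked is the last store.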
+ for _, v := range b.Values {
+ if v.Type.IsMemory() {
+ last[v.ID] = true
+ }
+ }
+ for _, v := range b.Values {
+ if v.Type.IsMemory() {
+ if m := prevStore(v); m != nil {
+ last[m.ID] = false
+ }
+ }
+ }
+ var lastMem *Value
+ for _, v := range b.Values {
+ if last[v.ID] {
+ lastMem = v
+ break
+ }
+ }
+
+ // Check all stores, from last to first.
+ memCheck:
+ for v := lastMem; v != nil; v = prevStore(v) {
+ info := pairableStores[v.Op]
+ if info.width == 0 {
+ continue // Not pairable.
+ }
+ if !offsetOk(v.Aux, v.AuxInt, info.width) {
+ continue // Not advisable to pair.
+ }
+ ptr := v.Args[0]
+ val := v.Args[1]
+ mem := v.Args[2]
+ off := v.AuxInt
+ aux := v.Aux
+
+ // Look for an earlier store we can combine with.
+ lowerOk := true // can we still pair with the slot at off-width?
+ higherOk := true // can we still pair with the slot at off+width?
+ count := 10 // max lookback distance
+ for w := prevStore(v); w != nil; w = prevStore(w) {
+ if w.Uses != 1 {
+ // We can't combine stores if the earlier
+ // store has any use besides the next one
+ // in the store chain.
+ // (Unless we could check the aliasing of
+ // all those other uses.)
+ continue memCheck
+ }
+ if w.Op == v.Op &&
+ w.Args[0] == ptr &&
+ w.Aux == aux &&
+ (lowerOk && w.AuxInt == off-info.width || higherOk && w.AuxInt == off+info.width) {
+ // This op is mergeable with v.
+
+ // Commit point.
+
+ // ptr val1 val2 mem
+ args := []*Value{ptr, val, w.Args[1], mem}
+ if w.AuxInt == off-info.width {
+ args[1], args[2] = args[2], args[1]
+ off -= info.width
+ }
+ v.reset(info.pair)
+ v.AddArgs(args...)
+ v.Aux = aux
+ v.AuxInt = off
+ v.Pos = w.Pos // take position of earlier of the two stores (TODO: not really working?)
+
+ // Make w just a memory copy.
+ wmem := w.MemoryArg()
+ w.reset(OpCopy)
+ w.SetArgs1(wmem)
+ continue memCheck
+ }
+ if count--; count == 0 {
+ // Only look back so far.
+ // This keeps us in O(n) territory, and it
+ // also prevents us from keeping values
+ // in registers for too long (and thus
+ // needing to spill them).
+ continue memCheck
+ }
+ // We're now looking at a store w that sits between the
+ // store v we intend to merge into and the earlier store
+ // we hope to merge with it.
+ // Make sure w doesn't alias the store we'll be moving.
+ var width int64
+ switch w.Op {
+ case OpARM64MOVDstore, OpARM64MOVDstorezero, OpARM64FMOVDstore:
+ width = 8
+ case OpARM64MOVWstore, OpARM64MOVWstorezero, OpARM64FMOVSstore:
+ width = 4
+ case OpARM64MOVHstore, OpARM64MOVHstorezero:
+ width = 2
+ case OpARM64MOVBstore, OpARM64MOVBstorezero:
+ width = 1
+ case OpCopy:
+ continue // this was a store we merged earlier
+ default:
+ // Can't reorder with any other memory operations.
+ // (atomics, calls, ...)
+ continue memCheck
+ }
+
+ // We only allow reordering with respect to other
+ // writes to the same pointer and aux, so we can
+ // compute the exact aliasing relationship.
+ if w.Args[0] != ptr || w.Aux != aux {
+ continue memCheck
+ }
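+ // overlap(o1, w1, o2, w2) reports whether the byte ranges
+ // [o1, o1+w1) and [o2, o2+w2) intersect.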
+ if overlap(w.AuxInt, width, off-info.width, info.width) {
+ // Aliases with slot before v's location.
+ lowerOk = false
+ }
+ if overlap(w.AuxInt, width, off+info.width, info.width) {
+ // Aliases with slot after v's location.
+ higherOk = false
+ }
+ if !higherOk && !lowerOk {
+ continue memCheck
+ }
+ }
+ }
+ }
+}
p.b = uint32(x >> 32)
}
func store32be(p *struct{ a, b uint32 }, x uint64) {
+ // arm64:"STPW"
// ppc64:"MOVD",-"MOVW",-"SRD"
// s390x:"MOVD",-"MOVW",-"SRD"
p.a = uint32(x >> 32)
+ // arm64:-"STPW"
// ppc64:-"MOVW",-"SRD"
// s390x:-"MOVW",-"SRD"
p.b = uint32(x)
b[0] = byte(v)
return b
}
+
+// --------------------------------- //
+// Arm64 double-register loads //
+// --------------------------------- //
+
+func dwloadI64(p *struct{ a, b int64 }) int64 {
+ // arm64:"LDP\t"
+ return p.a + p.b
+}
+func dwloadI32(p *struct{ a, b int32 }) int32 {
+ // arm64:"LDPW\t"
+ return p.a + p.b
+}
+func dwloadF64(p *struct{ a, b float64 }) float64 {
+ // arm64:"FLDPD\t"
+ return p.a + p.b
+}
+func dwloadF32(p *struct{ a, b float32 }) float32 {
+ // arm64:"FLDPS\t"
+ return p.a + p.b
+}
+
+func dwloadBig(p *struct{ a, b, c, d, e, f int64 }) int64 {
+ // arm64:"LDP\t\\(", "LDP\t16", "LDP\t32"
+ return p.c + p.f + p.a + p.e + p.d + p.b
+}
+
+func dwloadArg(a [2]int64) int64 {
+ // arm64:"LDP\t"
+ return a[0] + a[1]
+}
+
+// ---------------------------------- //
+// Arm64 double-register stores //
+// ---------------------------------- //
+
+func dwstoreI64(p *struct{ a, b int64 }, x, y int64) {
+ // arm64:"STP\t"
+ p.a = x
+ p.b = y
+}
+func dwstoreI32(p *struct{ a, b int32 }, x, y int32) {
+ // arm64:"STPW\t"
+ p.a = x
+ p.b = y
+}
+func dwstoreF64(p *struct{ a, b float64 }, x, y float64) {
+ // arm64:"FSTPD\t"
+ p.a = x
+ p.b = y
+}
+func dwstoreF32(p *struct{ a, b float32 }, x, y float32) {
+ // arm64:"FSTPS\t"
+ p.a = x
+ p.b = y
+}
+
+func dwstoreBig(p *struct{ a, b, c, d, e, f int64 }, a, b, c, d, e, f int64) {
+ // This is not perfect. We merge b+a, then d+e, then c and f have no pair.
+ p.c = c
+ p.f = f
+ // arm64:`STP\s\(R[0-9]+, R[0-9]+\), \(R[0-9]+\)`
+ p.a = a
+ // arm64:`STP\s\(R[0-9]+, R[0-9]+\), 24\(R[0-9]+\)`
+ p.e = e
+ p.d = d
+ p.b = b
+}
+
+func dwstoreRet() [2]int {
+ // arm64:"STP\t"
+ return [2]int{5, 6}
+}
+
+func dwstoreLocal(i int) int64 {
+ var a [2]int64
+ a[0] = 5
+ // arm64:"STP\t"
+ a[1] = 6
+ return a[i]
+}
+
+func dwstoreOrder(p *struct {
+ a, b int64
+ c, d, e, f bool
+}, a, b int64) {
+ // arm64:"STP\t"
+ p.a = a
+ p.c = true
+ p.e = true
+ p.b = b
+}