cmd/compile: pair loads and stores on arm64
author	khr@golang.org <khr@golang.org>
Sun, 17 Nov 2024 20:07:34 +0000 (12:07 -0800)
committer	Keith Randall <khr@golang.org>
Thu, 13 Feb 2025 22:07:47 +0000 (14:07 -0800)
Look for possible paired load/store operations on arm64.
I don't expect this to be a lot faster, but it will save
binary space and, indirectly through reduced icache pressure,
at least a bit of time.
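
As an illustrative sketch (hand-written, not verbatim compiler
output), the rewrite turns two adjacent same-width loads into a
single two-register load, and likewise for stores:

	func sum(p *[2]int64) int64 {
		// Before pairing (two single-register loads):
		//   MOVD (R0), R1
		//   MOVD 8(R0), R2
		// After pairing (one two-register load):
		//   LDP (R0), (R1, R2)
		return p[0] + p[1]
	}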

Change-Id: I4dd73b0e6329c4659b7453998f9b75320fcf380b
Reviewed-on: https://go-review.googlesource.com/c/go/+/629256
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
src/cmd/compile/internal/ssa/compile.go
src/cmd/compile/internal/ssa/pair.go [new file with mode: 0644]
test/codegen/memcombine.go
test/tighten.go

diff --git a/src/cmd/compile/internal/ssa/compile.go b/src/cmd/compile/internal/ssa/compile.go
index 3f46599a3e5756cbc97421aa84c9d46234f26de2..634a6f68642e712112e39f99ca8fa38e3ba8709c 100644 (file)
@@ -488,6 +488,7 @@ var passes = [...]pass{
        {name: "lower", fn: lower, required: true},
        {name: "addressing modes", fn: addressingModes, required: false},
        {name: "late lower", fn: lateLower, required: true},
+       {name: "pair", fn: pair},
        {name: "lowered deadcode for cse", fn: deadcode}, // deadcode immediately before CSE avoids CSE making dead values live again
        {name: "lowered cse", fn: cse},
        {name: "elim unread autos", fn: elimUnreadAutos},
diff --git a/src/cmd/compile/internal/ssa/pair.go b/src/cmd/compile/internal/ssa/pair.go
new file mode 100644 (file)
index 0000000..5af9b0c
--- /dev/null
@@ -0,0 +1,357 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ssa
+
+import (
+       "cmd/compile/internal/ir"
+       "cmd/compile/internal/types"
+       "slices"
+)
+
+// The pair pass finds memory operations that can be paired up
+// into single 2-register memory instructions.
+func pair(f *Func) {
+       // Only arm64 for now. This pass is fairly arch-specific.
+       switch f.Config.arch {
+       case "arm64":
+       default:
+               return
+       }
+       pairLoads(f)
+       pairStores(f)
+}
+
+type pairableLoadInfo struct {
+       width int64 // width of one element in the pair, in bytes
+       pair  Op
+}
+
+// All pairableLoad ops must take 2 arguments, a pointer and a memory.
+// They must also take an offset in Aux/AuxInt.
+var pairableLoads = map[Op]pairableLoadInfo{
+       OpARM64MOVDload:  {8, OpARM64LDP},
+       OpARM64MOVWload:  {4, OpARM64LDPW},
+       OpARM64FMOVDload: {8, OpARM64FLDPD},
+       OpARM64FMOVSload: {4, OpARM64FLDPS},
+}
+
+type pairableStoreInfo struct {
+       width int64 // width of one element in the pair, in bytes
+       pair  Op
+}
+
+// All pairableStore keys must take 3 arguments, a pointer, a value, and a memory.
+// All pairableStore values must take 4 arguments, a pointer, 2 values, and a memory.
+// They must also take an offset in Aux/AuxInt.
+var pairableStores = map[Op]pairableStoreInfo{
+       OpARM64MOVDstore:  {8, OpARM64STP},
+       OpARM64MOVWstore:  {4, OpARM64STPW},
+       OpARM64FMOVDstore: {8, OpARM64FSTPD},
+       OpARM64FMOVSstore: {4, OpARM64FSTPS},
+       // TODO: storezero variants.
+}
+
+// offsetOk returns true if a pair instruction should be used
+// for the offset Aux+off, when the data width (of the
+// unpaired instructions) is width.
+// This function is best-effort. The compiled function must
+// still work if offsetOk always returns true.
+// TODO: this is currently arm64-specific.
+func offsetOk(aux Aux, off, width int64) bool {
+       if true {
+               // Seems to generate slightly smaller code if we just
+               // always allow this rewrite.
+               //
+               // Without pairing, we have 2 load instructions, like:
+               //   LDR 88(R0), R1
+               //   LDR 96(R0), R2
+               // with pairing we have, best case:
+               //   LDP 88(R0), R1, R2
+               // but maybe we need an adjuster if out of range or unaligned:
+               //   ADD R0, $88, R27
+               //   LDP (R27), R1, R2
+               // Even with the adjuster, it is at least no worse.
+               //
+               // A similar situation occurs when accessing globals.
+               // Two loads from globals requires 4 instructions,
+               // two ADRP and two LDR. With pairing, we need
+               // ADRP+ADD+LDP, three instructions.
+               //
+               // With pairing, it looks like the critical path might
+               // be a little bit longer. But it should never be more
+               // instructions.
+               // TODO: see if that longer critical path causes any
+               // regressions.
+               return true
+       }
+       if aux != nil {
+               if _, ok := aux.(*ir.Name); !ok {
+                       // Offset is probably too big (globals).
+                       return false
+               }
+               // We let *ir.Names pass here, as
+               // they are probably small offsets from SP.
+               // There's no guarantee that we're in range
+               // in that case though (we don't know the
+               // stack frame size yet), so the assembler
+               // might need to issue fixup instructions.
+               // Assume some small frame size.
+               if off >= 0 {
+                       off += 120
+               }
+               // TODO: figure out how often this helps vs. hurts.
+       }
+       switch width {
+       case 4:
+               if off >= -256 && off <= 252 && off%4 == 0 {
+                       return true
+               }
+       case 8:
+               if off >= -512 && off <= 504 && off%8 == 0 {
+                       return true
+               }
+       }
+       return false
+}
+
+func pairLoads(f *Func) {
+       var loads []*Value
+
+       // Registry of aux values for sorting.
+       auxIDs := map[Aux]int{}
+       auxID := func(aux Aux) int {
+               id, ok := auxIDs[aux]
+               if !ok {
+                       id = len(auxIDs)
+                       auxIDs[aux] = id
+               }
+               return id
+       }
+
+       for _, b := range f.Blocks {
+               // Find loads.
+               loads = loads[:0]
+               clear(auxIDs)
+               for _, v := range b.Values {
+                       info := pairableLoads[v.Op]
+                       if info.width == 0 {
+                               continue // not pairable
+                       }
+                       if !offsetOk(v.Aux, v.AuxInt, info.width) {
+                               continue // not advisable
+                       }
+                       loads = append(loads, v)
+               }
+               if len(loads) < 2 {
+                       continue
+               }
+
+               // Sort to put pairable loads together.
+               slices.SortFunc(loads, func(x, y *Value) int {
+                       // First sort by op, ptr, and memory arg.
+                       if x.Op != y.Op {
+                               return int(x.Op - y.Op)
+                       }
+                       if x.Args[0].ID != y.Args[0].ID {
+                               return int(x.Args[0].ID - y.Args[0].ID)
+                       }
+                       if x.Args[1].ID != y.Args[1].ID {
+                               return int(x.Args[1].ID - y.Args[1].ID)
+                       }
+                       // Then sort by aux. (nil first, then by aux ID)
+                       if x.Aux != nil {
+                               if y.Aux == nil {
+                                       return 1
+                               }
+                               a, b := auxID(x.Aux), auxID(y.Aux)
+                               if a != b {
+                                       return a - b
+                               }
+                       } else if y.Aux != nil {
+                               return -1
+                       }
+                       // Then sort by offset, low to high.
+                       return int(x.AuxInt - y.AuxInt)
+               })
+
+               // Look for pairable loads.
+               for i := 0; i < len(loads)-1; i++ {
+                       x := loads[i]
+                       y := loads[i+1]
+                       if x.Op != y.Op || x.Args[0] != y.Args[0] || x.Args[1] != y.Args[1] {
+                               continue
+                       }
+                       if x.Aux != y.Aux {
+                               continue
+                       }
+                       if x.AuxInt+pairableLoads[x.Op].width != y.AuxInt {
+                               continue
+                       }
+
+                       // Commit point.
+
+                       // Make the 2-register load.
+                       load := b.NewValue2IA(x.Pos, pairableLoads[x.Op].pair, types.NewTuple(x.Type, y.Type), x.AuxInt, x.Aux, x.Args[0], x.Args[1])
+
+                       // Modify x to be (Select0 load). Similar for y.
+                       x.reset(OpSelect0)
+                       x.SetArgs1(load)
+                       y.reset(OpSelect1)
+                       y.SetArgs1(load)
+
+                       i++ // Skip y next time around the loop.
+               }
+       }
+}
+
+func pairStores(f *Func) {
+       last := f.Cache.allocBoolSlice(f.NumValues())
+       defer f.Cache.freeBoolSlice(last)
+
+       // prevStore returns the previous store in the
+       // same block, or nil if there are none.
+       prevStore := func(v *Value) *Value {
+               if v.Op == OpInitMem || v.Op == OpPhi {
+                       return nil
+               }
+               m := v.MemoryArg()
+               if m.Block != v.Block {
+                       return nil
+               }
+               return m
+       }
+
+       for _, b := range f.Blocks {
+               // Find last store in block, so we can
+               // walk the stores last to first.
+               // Last to first helps ensure that the rewrites we
+               // perform do not get in the way of subsequent rewrites.
+               for _, v := range b.Values {
+                       if v.Type.IsMemory() {
+                               last[v.ID] = true
+                       }
+               }
+               for _, v := range b.Values {
+                       if v.Type.IsMemory() {
+                               if m := prevStore(v); m != nil {
+                                       last[m.ID] = false
+                               }
+                       }
+               }
+               var lastMem *Value
+               for _, v := range b.Values {
+                       if last[v.ID] {
+                               lastMem = v
+                               break
+                       }
+               }
+
+               // Check all stores, from last to first.
+       memCheck:
+               for v := lastMem; v != nil; v = prevStore(v) {
+                       info := pairableStores[v.Op]
+                       if info.width == 0 {
+                               continue // Not pairable.
+                       }
+                       if !offsetOk(v.Aux, v.AuxInt, info.width) {
+                               continue // Not advisable to pair.
+                       }
+                       ptr := v.Args[0]
+                       val := v.Args[1]
+                       mem := v.Args[2]
+                       off := v.AuxInt
+                       aux := v.Aux
+
+                       // Look for earlier store we can combine with.
+                       lowerOk := true
+                       higherOk := true
+                       count := 10 // max lookback distance
+                       for w := prevStore(v); w != nil; w = prevStore(w) {
+                               if w.Uses != 1 {
+                                       // We can't combine stores if the earlier
+                                       // store has any use besides the next one
+                                       // in the store chain.
+                                       // (Unless we could check the aliasing of
+                                       // all those other uses.)
+                                       continue memCheck
+                               }
+                               if w.Op == v.Op &&
+                                       w.Args[0] == ptr &&
+                                       w.Aux == aux &&
+                                       (lowerOk && w.AuxInt == off-info.width || higherOk && w.AuxInt == off+info.width) {
+                                       // This op is mergeable with v.
+
+                                       // Commit point.
+
+                                       // ptr val1 val2 mem
+                                       args := []*Value{ptr, val, w.Args[1], mem}
+                                       if w.AuxInt == off-info.width {
+                                               args[1], args[2] = args[2], args[1]
+                                               off -= info.width
+                                       }
+                                       v.reset(info.pair)
+                                       v.AddArgs(args...)
+                                       v.Aux = aux
+                                       v.AuxInt = off
+                                       v.Pos = w.Pos // take position of earlier of the two stores (TODO: not really working?)
+
+                                       // Make w just a memory copy.
+                                       wmem := w.MemoryArg()
+                                       w.reset(OpCopy)
+                                       w.SetArgs1(wmem)
+                                       continue memCheck
+                               }
+                               if count--; count == 0 {
+                                       // Only look back so far.
+                                       // This keeps us in O(n) territory, and it
+                                       // also prevents us from keeping values
+                                       // in registers for too long (and thus
+                                       // needing to spill them).
+                                       continue memCheck
+                               }
+                               // We're now looking at a store w which is currently
+                               // between the store v that we're intending to merge into,
+                               // and the store we'll eventually find to merge with it.
+                               // Make sure this store doesn't alias with the one
+                               // we'll be moving.
+                               var width int64
+                               switch w.Op {
+                               case OpARM64MOVDstore, OpARM64MOVDstorezero, OpARM64FMOVDstore:
+                                       width = 8
+                               case OpARM64MOVWstore, OpARM64MOVWstorezero, OpARM64FMOVSstore:
+                                       width = 4
+                               case OpARM64MOVHstore, OpARM64MOVHstorezero:
+                                       width = 2
+                               case OpARM64MOVBstore, OpARM64MOVBstorezero:
+                                       width = 1
+                               case OpCopy:
+                                       continue // this was a store we merged earlier
+                               default:
+                                       // Can't reorder with any other memory operations.
+                                       // (atomics, calls, ...)
+                                       continue memCheck
+                               }
+
+                               // We only allow reordering with respect to other
+                               // writes to the same pointer and aux, so we can
+                               // compute the exact aliasing relationship.
+                               if w.Args[0] != ptr || w.Aux != aux {
+                                       continue memCheck
+                               }
+                               if overlap(w.AuxInt, width, off-info.width, info.width) {
+                                       // Aliases with slot before v's location.
+                                       lowerOk = false
+                               }
+                               if overlap(w.AuxInt, width, off+info.width, info.width) {
+                                       // Aliases with slot after v's location.
+                                       higherOk = false
+                               }
+                               if !higherOk && !lowerOk {
+                                       continue memCheck
+                               }
+                       }
+               }
+       }
+}
diff --git a/test/codegen/memcombine.go b/test/codegen/memcombine.go
index e1cae0e46983e6362d8efe659e83f76b8cdae9fd..2a9cc68ab0e4418caf51a7adcbbceb5f861828c9 100644 (file)
@@ -899,9 +899,11 @@ func store32le(p *struct{ a, b uint32 }, x uint64) {
        p.b = uint32(x >> 32)
 }
 func store32be(p *struct{ a, b uint32 }, x uint64) {
+       // arm64:"STPW"
        // ppc64:"MOVD",-"MOVW",-"SRD"
        // s390x:"MOVD",-"MOVW",-"SRD"
        p.a = uint32(x >> 32)
+       // arm64:-"STPW"
        // ppc64:-"MOVW",-"SRD"
        // s390x:-"MOVW",-"SRD"
        p.b = uint32(x)
@@ -970,3 +972,95 @@ func issue70300Reverse(v uint64) (b [8]byte) {
        b[0] = byte(v)
        return b
 }
+
+// --------------------------------- //
+//    Arm64 double-register loads    //
+// --------------------------------- //
+
+func dwloadI64(p *struct{ a, b int64 }) int64 {
+       // arm64:"LDP\t"
+       return p.a + p.b
+}
+func dwloadI32(p *struct{ a, b int32 }) int32 {
+       // arm64:"LDPW\t"
+       return p.a + p.b
+}
+func dwloadF64(p *struct{ a, b float64 }) float64 {
+       // arm64:"FLDPD\t"
+       return p.a + p.b
+}
+func dwloadF32(p *struct{ a, b float32 }) float32 {
+       // arm64:"FLDPS\t"
+       return p.a + p.b
+}
+
+func dwloadBig(p *struct{ a, b, c, d, e, f int64 }) int64 {
+       // arm64:"LDP\t\\(", "LDP\t16", "LDP\t32"
+       return p.c + p.f + p.a + p.e + p.d + p.b
+}
+
+func dwloadArg(a [2]int64) int64 {
+       // arm64:"LDP\t"
+       return a[0] + a[1]
+}
+
+// ---------------------------------- //
+//    Arm64 double-register stores    //
+// ---------------------------------- //
+
+func dwstoreI64(p *struct{ a, b int64 }, x, y int64) {
+       // arm64:"STP\t"
+       p.a = x
+       p.b = y
+}
+func dwstoreI32(p *struct{ a, b int32 }, x, y int32) {
+       // arm64:"STPW\t"
+       p.a = x
+       p.b = y
+}
+func dwstoreF64(p *struct{ a, b float64 }, x, y float64) {
+       // arm64:"FSTPD\t"
+       p.a = x
+       p.b = y
+}
+func dwstoreF32(p *struct{ a, b float32 }, x, y float32) {
+       // arm64:"FSTPS\t"
+       p.a = x
+       p.b = y
+}
+
+func dwstoreBig(p *struct{ a, b, c, d, e, f int64 }, a, b, c, d, e, f int64) {
+       // This is not perfect. We merge b+a, then d+e, then c and f have no pair.
+       p.c = c
+       p.f = f
+       // arm64:`STP\s\(R[0-9]+, R[0-9]+\), \(R[0-9]+\)`
+       p.a = a
+       // arm64:`STP\s\(R[0-9]+, R[0-9]+\), 24\(R[0-9]+\)`
+       p.e = e
+       p.d = d
+       p.b = b
+}
+
+func dwstoreRet() [2]int {
+       // arm64:"STP\t"
+       return [2]int{5, 6}
+}
+
+func dwstoreLocal(i int) int64 {
+       var a [2]int64
+       a[0] = 5
+       // arm64:"STP\t"
+       a[1] = 6
+       return a[i]
+}
+
+func dwstoreOrder(p *struct {
+       a, b       int64
+       c, d, e, f bool
+}, a, b int64) {
+       // arm64:"STP\t"
+       p.a = a
+       p.c = true
+       p.e = true
+       p.b = b
+}
diff --git a/test/tighten.go b/test/tighten.go
index 92ed2492b26e74f7581ec6af2542a9fe2981c86c..d85dfecbb0237e53b911294423c51763cd2706eb 100644 (file)
@@ -9,14 +9,20 @@
 package main
 
 var (
-       e  any
-       ts uint16
+       ga, gb, gc, gd int
 )
 
 func moveValuesWithMemoryArg(len int) {
        for n := 0; n < len; n++ {
-               // Load of e.data is lowered as a MOVDload op, which has a memory
-               // argument. It's moved near where it's used.
-               _ = e != ts // ERROR "MOVDload is moved$" "MOVDaddr is moved$"
+               // Loads of b and d can be delayed until inside the outer "if".
+               a := ga
+               b := gb // ERROR "MOVDload is moved$"
+               c := gc
+               d := gd // ERROR "MOVDload is moved$"
+               if a == c {
+                       if b == d {
+                               return
+                       }
+               }
        }
 }