]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: improve stp merging for non-sequent cases
authorMelnikov Denis <melnikov.denis.aleksandrovich@gmail.com>
Thu, 21 Aug 2025 15:00:57 +0000 (18:00 +0300)
committerGopher Robot <gobot@golang.org>
Tue, 9 Sep 2025 19:10:01 +0000 (12:10 -0700)
Original algorithm merges stores with the first
mergeable store in the chain, but it misses some
cases. Additional reordering stores in increasing order
of memory access in the chain allows merging in these cases.

Fixes #71987

There are the results of sweet benchmarks and
the difference between sizes of sections .text

                        │ old.results │            new.results             │
                        │   sec/op    │   sec/op     vs base               │
BleveIndexBatch100-4      7.614 ± 2%    7.548 ± 1%       ~ (p=0.190 n=10)
ESBuildThreeJS-4         821.3m ± 0%   819.0m ± 1%       ~ (p=0.165 n=10)
ESBuildRomeTS-4          206.2m ± 1%   204.4m ± 1%  -0.90% (p=0.023 n=10)
EtcdPut-4                64.89m ± 1%   64.94m ± 2%       ~ (p=0.684 n=10)
EtcdSTM-4                318.4m ± 0%   319.2m ± 1%       ~ (p=0.631 n=10)
GoBuildKubelet-4          157.4 ± 0%    157.6 ± 0%       ~ (p=0.105 n=10)
GoBuildKubeletLink-4      12.42 ± 2%    12.41 ± 1%       ~ (p=0.529 n=10)
GoBuildIstioctl-4         124.4 ± 0%    124.4 ± 0%       ~ (p=0.579 n=10)
GoBuildIstioctlLink-4     8.700 ± 1%    8.693 ± 1%       ~ (p=0.912 n=10)
GoBuildFrontend-4         46.52 ± 0%    46.50 ± 0%       ~ (p=0.971 n=10)
GoBuildFrontendLink-4     2.282 ± 1%    2.272 ± 1%       ~ (p=0.529 n=10)
GoBuildTsgo-4             75.02 ± 1%    75.31 ± 1%       ~ (p=0.436 n=10)
GoBuildTsgoLink-4         1.229 ± 1%    1.219 ± 1%  -0.82% (p=0.035 n=10)
GopherLuaKNucleotide-4    34.77 ± 5%    34.31 ± 1%  -1.33% (p=0.015 n=10)
MarkdownRenderXHTML-4    286.6m ± 0%   285.7m ± 1%       ~ (p=0.315 n=10)
Tile38QueryLoad-4        657.2µ ± 1%   660.3µ ± 0%       ~ (p=0.436 n=10)
geomean                   2.570         2.563       -0.24%

Executable            Old .text  New .text     Change
-------------------------------------------------------
benchmark               6504820    6504020     -0.01%
bleve-index-bench       3903860    3903636     -0.01%
esbuild                 4801012    4801172     +0.00%
esbuild-bench           1256404    1256340     -0.01%
etcd                    9188148    9187076     -0.01%
etcd-bench              6462228    6461524     -0.01%
go                      5924468    5923892     -0.01%
go-build-bench          1282004    1281940     -0.00%
gopher-lua-bench        1639540    1639348     -0.01%
markdown-bench          1478452    1478356     -0.01%
tile38-bench            2753524    2753300     -0.01%
tile38-server          10241380   10240068     -0.01%

Change-Id: Ieb4fdfd656aca458f65fc45938de70550632bd13
Reviewed-on: https://go-review.googlesource.com/c/go/+/698097
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Mark Freeman <markfreeman@google.com>
Reviewed-by: Keith Randall <khr@google.com>
src/cmd/compile/internal/ssa/pair.go
test/codegen/memcombine.go

index 83d7e476dd9c20712568715f1b8297a040881602..0aa70e416f8f324d9c54d9f91c1c3fd06d52c514 100644 (file)
@@ -212,6 +212,12 @@ func pairStores(f *Func) {
        last := f.Cache.allocBoolSlice(f.NumValues())
        defer f.Cache.freeBoolSlice(last)
 
+       type stChainElem struct {
+               v *Value
+               i int // Index in chain (0 == last store)
+       }
+       var order []stChainElem
+
        // prevStore returns the previous store in the
        // same block, or nil if there are none.
        prevStore := func(v *Value) *Value {
@@ -225,6 +231,27 @@ func pairStores(f *Func) {
                return m
        }
 
+       // storeWidth returns the width of store,
+       // or 0 if it is not a store
+       storeWidth := func(op Op) int64 {
+               var width int64
+               switch op {
+               case OpARM64MOVDstore, OpARM64FMOVDstore:
+                       width = 8
+               case OpARM64MOVWstore, OpARM64FMOVSstore:
+                       width = 4
+               case OpARM64MOVHstore:
+                       width = 2
+               case OpARM64MOVBstore:
+                       width = 1
+               default:
+                       width = 0
+               }
+               return width
+       }
+
+       const limit = 10
+
        for _, b := range f.Blocks {
                // Find last store in block, so we can
                // walk the stores last to first.
@@ -250,9 +277,84 @@ func pairStores(f *Func) {
                        }
                }
 
+               order = order[:0]
+               for i, v := 0, lastMem; v != nil; v = prevStore(v) {
+                       order = append(order, stChainElem{v, i})
+                       i++
+               }
+       reordering:
+               for i, v_elem := range order {
+                       v := v_elem.v
+                       if v.Uses != 1 {
+                               // We can't reorder stores if the earlier
+                               // store has any use besides the next one
+                               // in the store chain.
+                               // (Unless we could check the aliasing of
+                               // all those other uses.)
+                               continue
+                       }
+                       widthV := storeWidth(v.Op)
+                       if widthV == 0 {
+                               // Can't reorder with any other memory operations.
+                               // (atomics, calls, ...)
+                               continue
+                       }
+                       chain := order[i+1:]
+                       count := limit
+                       // Var 'count' keeps us in O(n) territory
+                       for j, w_elem := range chain {
+                               if count--; count == 0 {
+                                       // Only look back so far.
+                                       // This keeps us in O(n) territory, and it
+                                       // also prevents us from keeping values
+                                       // in registers for too long (and thus
+                                       // needing to spill them).
+                                       continue reordering
+                               }
+
+                               w := w_elem.v
+                               if w.Uses != 1 {
+                                       // We can't reorder stores if the earlier
+                                       // store has any use besides the next one
+                                       // in the store chain.
+                                       // (Unless we could check the aliasing of
+                                       // all those other uses.)
+                                       continue reordering
+                               }
+
+                               widthW := storeWidth(w.Op)
+                               if widthW == 0 {
+                                       // Can't reorder with any other memory operations.
+                                       // (atomics, calls, ...)
+                                       continue reordering
+                               }
+
+                               // We only allow reordering with respect to other
+                               // writes to the same pointer and aux, so we can
+                               // compute the exact the aliasing relationship.
+                               if w.Args[0] != v.Args[0] ||
+                                       w.Aux != v.Aux {
+                                       // Can't reorder with operation with incomparable destination memory pointer.
+                                       continue reordering
+                               }
+                               if overlap(w.AuxInt, widthW, v.AuxInt, widthV) {
+                                       // Aliases with the same slot with v's location.
+                                       continue reordering
+                               }
+
+                               // Reordering stores in increasing order of memory access
+                               if v.AuxInt < w.AuxInt {
+                                       order[i], order[i+j+1] = order[i+j+1], order[i]
+                                       v = w
+                                       widthV = widthW
+                               }
+                       }
+               }
+
                // Check all stores, from last to first.
        memCheck:
-               for v := lastMem; v != nil; v = prevStore(v) {
+               for i, v_elem := range order {
+                       v := v_elem.v
                        info := pairableStores[v.Op]
                        if info.width == 0 {
                                continue // Not pairable.
@@ -269,8 +371,10 @@ func pairStores(f *Func) {
                        // Look for earlier store we can combine with.
                        lowerOk := true
                        higherOk := true
-                       count := 10 // max lookback distance
-                       for w := prevStore(v); w != nil; w = prevStore(w) {
+                       count := limit // max lookback distance
+                       chain := order[i+1:]
+                       for _, w_elem := range chain {
+                               w := w_elem.v
                                if w.Uses != 1 {
                                        // We can't combine stores if the earlier
                                        // store has any use besides the next one
@@ -293,11 +397,17 @@ func pairStores(f *Func) {
                                                args[1], args[2] = args[2], args[1]
                                                off -= info.width
                                        }
+
                                        v.reset(info.pair)
                                        v.AddArgs(args...)
                                        v.Aux = aux
                                        v.AuxInt = off
-                                       v.Pos = w.Pos // take position of earlier of the two stores (TODO: not really working?)
+                                       // Take position of earlier of the two stores
+                                       if v_elem.i < w_elem.i {
+                                               v.Pos = w.Pos
+                                       } else {
+                                               w.Pos = v.Pos
+                                       }
 
                                        // Make w just a memory copy.
                                        wmem := w.MemoryArg()
index fa0e902ac2b485834f08a226c4a0ba528428d6c7..9457d15f6a431ba0a43d9841d19eda3981a8b0b5 100644 (file)
@@ -1053,17 +1053,34 @@ func dwstoreF32(p *struct{ a, b float32 }, x, y float32) {
 }
 
 func dwstoreBig(p *struct{ a, b, c, d, e, f int64 }, a, b, c, d, e, f int64) {
-       // This is not perfect. We merge b+a, then d+e, then c and f have no pair.
+       // arm64:`STP\s\(R[0-9]+, R[0-9]+\), 16\(R[0-9]+\)`
        p.c = c
+       // arm64:`STP\s\(R[0-9]+, R[0-9]+\), 32\(R[0-9]+\)`
        p.f = f
        // arm64:`STP\s\(R[0-9]+, R[0-9]+\), \(R[0-9]+\)`
        p.a = a
-       // arm64:`STP\s\(R[0-9]+, R[0-9]+\), 24\(R[0-9]+\)`
        p.e = e
        p.d = d
        p.b = b
 }
 
+func dwstoreUnorderedArray(p *struct{ a, b, c, d int }, a, b, c, d int) {
+       // arm64:`STP\s\(R[0-9]+, R[0-9]+\), 16\(R[0-9]+\)`
+       p.c = c
+       p.d = d
+       // arm64:`STP\s\(R[0-9]+, R[0-9]+\), \(R[0-9]+\)`
+       p.a = a
+       p.b = b
+}
+
+func dwstoreBigNil(p *struct{ i, j struct{ a, b, c int } }) {
+       // arm64:`STP\s\(ZR, ZR\), 32\(R[0-9]+\)`
+       // arm64:`STP\s\(ZR, ZR\), 16\(R[0-9]+\)`
+       p.j = struct{ a, b, c int }{}
+       // arm64:`STP\s\(ZR, ZR\), \(R[0-9]+\)`
+       p.i = struct{ a, b, c int }{}
+}
+
 func dwstoreRet() [2]int {
        // arm64:"STP\t"
        return [2]int{5, 6}