From: amusman
Date: Fri, 23 Aug 2024 21:25:32 +0000 (+0300)
Subject: cmd/compile: CSE loads across disjoint stores
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=c34b99a5e20307a55a047543f6b48d8a28d830b5;p=gostls13.git

cmd/compile: CSE loads across disjoint stores

Enable partitioning memory-using instructions, such as regular loads,
together across disjoint memory-defining instructions (currently only
stores). Keep a memory table that remembers the appropriate memory
definition for each supported memory-using instruction. This allows
more load instructions to be matched, and it could be improved further
by handling additional cases in the common utility `disjoint`.

Overall, this change slightly reduces code size. For example, here is
the code size difference on linux_arm64:

Executable     Old .text    New .text    Change
------------------------------------------------
asm              1963124      1961972    -0.06%
cgo              1734228      1733140    -0.06%
compile          8948740      8948516    -0.00%
cover            1864500      1863588    -0.05%
link             2555700      2552676    -0.12%
preprofile        863636       862980    -0.08%
vet              2869220      2867556    -0.06%

Some benchmark results from a local run:

shortname: aws_jsonutil
pkg: github.com/aws/aws-sdk-go/private/protocol/json/jsonutil
             │ Orig-rand.stdout │        Cse1-rand.stdout        │
             │      sec/op      │    sec/op      vs base         │
BuildJSON-8      1.511µ ± 0%      1.516µ ± 0%  +0.33% (p=0.003 n=15)
StdlibJSON-8     1.254µ ± 0%      1.227µ ± 0%  -2.15% (p=0.000 n=15)
geomean          1.377µ           1.364µ       -0.92%

shortname: kanzi
toolchain: Cse1-rand
goos: linux
goarch: arm64
pkg: github.com/flanglet/kanzi-go/benchmark
         │ Orig-rand.stdout │        Cse1-rand.stdout        │
         │      sec/op      │    sec/op      vs base         │
FPAQ-4      26.11m ± 0%        25.61m ± 0%  -1.93% (p=0.000 n=10)
LZ-4        1.461m ± 1%        1.445m ± 1%       ~ (p=0.105 n=10)
MTFT-4      1.197m ± 0%        1.201m ± 0%  +0.36% (p=0.000 n=10)
geomean     3.574m             3.543m       -0.88%

This change also tends to increase the number of NilChecks matched by
CSE, which would move statement boundary marks from OpNilCheck to its
user instruction (such as OpOffPtr), where they are more likely to be
lost during subsequent optimizations (see e.g. #75249). Because cse
does not actually remove the nil checks, also update it here to not
move statement boundary marks off OpNilCheck; the later related
optimizations can handle that better.

Change-Id: Iddf4aa13d44de78ffecf6ccb4c0fd1d35533e844
Reviewed-on: https://go-review.googlesource.com/c/go/+/608115
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Keith Randall
Reviewed-by: Keith Randall
Reviewed-by: Michael Pratt
---

diff --git a/src/cmd/compile/internal/ssa/cse.go b/src/cmd/compile/internal/ssa/cse.go
index 28eb4f76a9..44a6ad9f9f 100644
--- a/src/cmd/compile/internal/ssa/cse.go
+++ b/src/cmd/compile/internal/ssa/cse.go
@@ -85,6 +85,11 @@ func cse(f *Func) {
 		pNum++
 	}
 
+	// Keep a table to remap memory operand of any memory user which does not have a memory result (such as a regular load),
+	// to some dominating memory operation, skipping the memory defs that do not alias with it.
+	memTable := f.Cache.allocInt32Slice(f.NumValues())
+	defer f.Cache.freeInt32Slice(memTable)
+
 	// Split equivalence classes at points where they have
 	// non-equivalent arguments.  Repeat until we can't find any
 	// more splits.
@@ -108,12 +113,23 @@ func cse(f *Func) {
 			// Sort by eq class of arguments.
 			slices.SortFunc(e, func(v, w *Value) int {
+				_, idxMem, _, _ := isMemUser(v)
 				for i, a := range v.Args {
-					b := w.Args[i]
-					if valueEqClass[a.ID] < valueEqClass[b.ID] {
+					var aId, bId ID
+					if i != idxMem {
+						b := w.Args[i]
+						aId = a.ID
+						bId = b.ID
+					} else {
+						// A memory user's mem argument may be remapped to allow matching
+						// identical load-like instructions across disjoint stores.
+						aId, _ = getEffectiveMemoryArg(memTable, v)
+						bId, _ = getEffectiveMemoryArg(memTable, w)
+					}
+					if valueEqClass[aId] < valueEqClass[bId] {
 						return -1
 					}
-					if valueEqClass[a.ID] > valueEqClass[b.ID] {
+					if valueEqClass[aId] > valueEqClass[bId] {
 						return +1
 					}
 				}
@@ -126,12 +142,23 @@ func cse(f *Func) {
 				v, w := e[j-1], e[j]
 				// Note: commutative args already correctly ordered by byArgClass.
 				eqArgs := true
+				_, idxMem, _, _ := isMemUser(v)
 				for k, a := range v.Args {
 					if v.Op == OpLocalAddr && k == 1 {
 						continue
 					}
-					b := w.Args[k]
-					if valueEqClass[a.ID] != valueEqClass[b.ID] {
+					var aId, bId ID
+					if k != idxMem {
+						b := w.Args[k]
+						aId = a.ID
+						bId = b.ID
+					} else {
+						// A memory user's mem argument may be remapped to allow matching
+						// identical load-like instructions across disjoint stores.
+						aId, _ = getEffectiveMemoryArg(memTable, v)
+						bId, _ = getEffectiveMemoryArg(memTable, w)
+					}
+					if valueEqClass[aId] != valueEqClass[bId] {
 						eqArgs = false
 						break
 					}
 				}
@@ -180,10 +207,19 @@ func cse(f *Func) {
 	defer f.Cache.freeValueSlice(rewrite)
 	for _, e := range partition {
 		slices.SortFunc(e, func(v, w *Value) int {
-			c := cmp.Compare(sdom.domorder(v.Block), sdom.domorder(w.Block))
-			if c != 0 {
+			if c := cmp.Compare(sdom.domorder(v.Block), sdom.domorder(w.Block)); c != 0 {
 				return c
 			}
+			if _, _, _, ok := isMemUser(v); ok {
+				// Additional ordering among the memory users within one block: prefer the earliest
+				// possible value among the set of equivalent values, that is, the one with the lowest
+				// skip count (lowest number of memory defs skipped until their common def).
+				_, vSkips := getEffectiveMemoryArg(memTable, v)
+				_, wSkips := getEffectiveMemoryArg(memTable, w)
+				if c := cmp.Compare(vSkips, wSkips); c != 0 {
+					return c
+				}
+			}
 			if v.Op == OpLocalAddr {
 				// compare the memory args for OpLocalAddrs in the same block
 				vm := v.Args[1]
@@ -254,7 +290,7 @@ func cse(f *Func) {
 		for _, v := range b.Values {
 			for i, w := range v.Args {
 				if x := rewrite[w.ID]; x != nil {
-					if w.Pos.IsStmt() == src.PosIsStmt {
+					if w.Pos.IsStmt() == src.PosIsStmt && w.Op != OpNilCheck {
 						// about to lose a statement marker, w
 						// w is an input to v; if they're in the same block
 						// and the same line, v is a good-enough new statement boundary.
@@ -420,3 +456,82 @@ func cmpVal(v, w *Value, auxIDs auxmap) types.Cmp {
 	return types.CMPeq
 }
+
+// Query if the given instruction only uses a "memory" argument and we may try to skip some memory "defs" if they do not alias with its address.
+// Return index of pointer argument, index of "memory" argument, the access width and true on such instructions, otherwise return (-1, -1, 0, false).
+func isMemUser(v *Value) (int, int, int64, bool) {
+	switch v.Op {
+	case OpLoad:
+		return 0, 1, v.Type.Size(), true
+	case OpNilCheck:
+		return 0, 1, 0, true
+	default:
+		return -1, -1, 0, false
+	}
+}
+
+// Query if the given "memory"-defining instruction's memory destination can be analyzed for aliasing with memory "user" instructions.
+// Return index of pointer argument, index of "memory" argument, the access width and true on such instructions, otherwise return (-1, -1, 0, false).
+func isMemDef(v *Value) (int, int, int64, bool) {
+	switch v.Op {
+	case OpStore:
+		return 0, 2, auxToType(v.Aux).Size(), true
+	default:
+		return -1, -1, 0, false
+	}
+}
+
+// The mem table keeps memTableSkipBits lower bits to store the number of skips of the "memory" operand
+// and the rest to store the ID of the destination "memory"-producing instruction.
+const memTableSkipBits = 8
+
+// The maximum ID value we are able to store in the memTable; otherwise fall back to v.ID.
+const maxId = ID(1<<(31-memTableSkipBits)) - 1
+
+// Return the first possibly-aliased store along the memory chain starting at v's memory argument and the number of not-aliased stores skipped.
+func getEffectiveMemoryArg(memTable []int32, v *Value) (ID, uint32) {
+	if code := uint32(memTable[v.ID]); code != 0 {
+		return ID(code >> memTableSkipBits), code & ((1 << memTableSkipBits) - 1)
+	}
+	if idxPtr, idxMem, width, ok := isMemUser(v); ok {
+		// TODO: We could early return some predefined value if width==0
+		memId := v.Args[idxMem].ID
+		if memId > maxId {
+			return memId, 0
+		}
+		mem, skips := skipDisjointMemDefs(v, idxPtr, idxMem, width)
+		if mem.ID <= maxId {
+			memId = mem.ID
+		} else {
+			skips = 0 // avoid the skip
+		}
+		memTable[v.ID] = int32(memId<<memTableSkipBits | ID(skips))
+		return memId, skips
+	}
+	// Not a supported memory user; nothing to remap.
+	return 0, 0
+}
+
+// Walk the memory chain starting at v's memory argument, skipping memory defs whose destination
+// is disjoint from the useWidth bytes accessed through v's pointer argument.
+// Return the first possibly-aliasing memory def and the number of defs skipped.
+func skipDisjointMemDefs(v *Value, idxPtr, idxMem int, useWidth int64) (*Value, uint32) {
+	usePtr := v.Args[idxPtr]
+	mem := v.Args[idxMem]
+	skips := uint32(0)
+	for ; skips < (1<<memTableSkipBits)-1; skips++ {
+		if idxPtr, idxMem, width, ok := isMemDef(mem); ok {
+			if mem.Uses > 50 {
+				// Skipping a memory def with a lot of uses may potentially increase register pressure.
+				break
+			}
+			defPtr := mem.Args[idxPtr]
+			if disjoint(defPtr, width, usePtr, useWidth) {
+				mem = mem.Args[idxMem]
+				continue
+			}
+		}
+		break
+	}
+	return mem, skips
+}
diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go
index 4b13d65618..04989d93c1 100644
--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@@ -944,7 +944,8 @@ func disjointTypes(t1 *types.Type, t2 *types.Type) bool {
 	}
 
 	if !t1.IsPtr() || !t2.IsPtr() {
-		panic("disjointTypes: one of arguments is not a pointer")
+		// Treat non-pointer types (such as TFUNC, TMAP, uintptr) conservatively.
+		return false
 	}
 
 	t1 = t1.Elem()
diff --git a/test/prove.go b/test/prove.go
index e12b6087d3..1ddbc7b3f5 100644
--- a/test/prove.go
+++ b/test/prove.go
@@ -459,6 +459,24 @@ func f14(p, q *int, a []int) {
 	useInt(a[i2+j]) // ERROR "Proved IsInBounds$"
 }
 
+func f14mem(q *int, a []int) (r int) {
+	p := &r
+	i1 := *q
+	*p = 1 // CSE of the "q" pointer load across disjoint store to "p"
+	i2 := *q
+	useInt(a[i1])
+	useInt(a[i2]) // ERROR "Proved IsInBounds$"
+	return r
+}
+
+func sliceptr(a *[]int, i int) int {
+	var x, y int
+	px, py := &x, &y
+	*px = (*a)[i]
+	*py = (*a)[i] // ERROR "Proved IsInBounds$"
+	return x + y
+}
+
 func f15(s []int, x int) {
 	useSlice(s[x:])
 	useSlice(s[:x]) // ERROR "Proved IsSliceInBounds$"
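
Editor's note on the memTable encoding: each cached entry packs the remapped
memory def's ID into the upper bits of an int32 and the number of skipped
disjoint stores into the low memTableSkipBits bits; a zero entry doubles as
"not yet computed", and values whose ID exceeds maxId are simply not cached.
The following standalone Go sketch illustrates that packing; the helper names
encodeMemEntry/decodeMemEntry are hypothetical and do not exist in cmd/compile,
which performs this logic inline on memTable entries.

	// Standalone illustration only; encodeMemEntry/decodeMemEntry are hypothetical
	// names that mirror the packing cse.go performs inline on memTable entries.
	package main

	import "fmt"

	const (
		skipBits = 8                    // plays the role of memTableSkipBits
		maxID    = 1<<(31-skipBits) - 1 // plays the role of maxId; larger IDs are not cached
		maxSkips = 1<<skipBits - 1      // the skip counter saturates here
	)

	// encodeMemEntry packs a memory def ID and a skip count into one non-negative int32.
	func encodeMemEntry(memID int32, skips uint32) (int32, bool) {
		if memID > maxID || skips > maxSkips {
			return 0, false // does not fit; the compiler falls back to the original memory arg
		}
		return memID<<skipBits | int32(skips), true
	}

	// decodeMemEntry is the inverse of encodeMemEntry.
	func decodeMemEntry(code int32) (memID int32, skips uint32) {
		u := uint32(code)
		return int32(u >> skipBits), u & maxSkips
	}

	func main() {
		code, ok := encodeMemEntry(1234, 3) // def ID 1234 reached after skipping 3 disjoint stores
		fmt.Println(ok, code)               // true 315907
		id, skips := decodeMemEntry(code)
		fmt.Println(id, skips) // 1234 3
	}

Keeping the skip count alongside the ID is what lets the rewrite phase prefer,
among equivalent loads in one block, the one that skipped the fewest stores.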