From: amusman
Date: Fri, 23 Aug 2024 21:25:32 +0000 (+0300)
Subject: cmd/compile: CSE loads across disjoint stores
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=c34b99a5e20307a55a047543f6b48d8a28d830b5;p=gostls13.git

cmd/compile: CSE loads across disjoint stores

Enable partitioning memory-using instructions, such as regular loads,
together across disjoint memory-defining instructions (currently only
stores). Keep a memory table that remembers the appropriate memory
definition for each supported memory-using instruction. This allows
more load instructions to be matched, and it could be improved further
by handling additional cases in the common utility `disjoint`.

Overall, this change slightly reduces code size. For example, here is
the code size difference on linux_arm64:

Executable     Old .text    New .text    Change
------------------------------------------------
asm              1963124      1961972    -0.06%
cgo              1734228      1733140    -0.06%
compile          8948740      8948516    -0.00%
cover            1864500      1863588    -0.05%
link             2555700      2552676    -0.12%
preprofile        863636       862980    -0.08%
vet              2869220      2867556    -0.06%

Some benchmark results from a local run:

shortname: aws_jsonutil
pkg: github.com/aws/aws-sdk-go/private/protocol/json/jsonutil
             │ Orig-rand.stdout │        Cse1-rand.stdout        │
             │      sec/op      │    sec/op      vs base         │
BuildJSON-8      1.511µ ± 0%      1.516µ ± 0%  +0.33% (p=0.003 n=15)
StdlibJSON-8     1.254µ ± 0%      1.227µ ± 0%  -2.15% (p=0.000 n=15)
geomean          1.377µ           1.364µ       -0.92%

shortname: kanzi
toolchain: Cse1-rand
goos: linux
goarch: arm64
pkg: github.com/flanglet/kanzi-go/benchmark
         │ Orig-rand.stdout │        Cse1-rand.stdout        │
         │      sec/op      │    sec/op      vs base         │
FPAQ-4      26.11m ± 0%        25.61m ± 0%  -1.93% (p=0.000 n=10)
LZ-4        1.461m ± 1%        1.445m ± 1%       ~ (p=0.105 n=10)
MTFT-4      1.197m ± 0%        1.201m ± 0%  +0.36% (p=0.000 n=10)
geomean     3.574m             3.543m       -0.88%

This change also tends to increase the number of NilChecks matched by
CSE, which would move statement boundary marks from OpNilCheck to its
user instruction (such as OpOffPtr), where they are more likely to be
lost during subsequent optimizations (see e.g. #75249). Because cse
does not actually remove the nil checks, also update it here to not
move statement boundary marks off OpNilCheck; the later related
optimizations can handle that better.

Change-Id: Iddf4aa13d44de78ffecf6ccb4c0fd1d35533e844
Reviewed-on: https://go-review.googlesource.com/c/go/+/608115
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Keith Randall
Reviewed-by: Keith Randall
Reviewed-by: Michael Pratt
---

diff --git a/src/cmd/compile/internal/ssa/cse.go b/src/cmd/compile/internal/ssa/cse.go
index 28eb4f76a9..44a6ad9f9f 100644
--- a/src/cmd/compile/internal/ssa/cse.go
+++ b/src/cmd/compile/internal/ssa/cse.go
@@ -85,6 +85,11 @@ func cse(f *Func) {
 		pNum++
 	}
 
+	// Keep a table to remap memory operand of any memory user which does not have a memory result (such as a regular load),
+	// to some dominating memory operation, skipping the memory defs that do not alias with it.
+	memTable := f.Cache.allocInt32Slice(f.NumValues())
+	defer f.Cache.freeInt32Slice(memTable)
+
 	// Split equivalence classes at points where they have
 	// non-equivalent arguments.  Repeat until we can't find any
 	// more splits.
@@ -108,12 +113,23 @@ func cse(f *Func) {
 			// Sort by eq class of arguments.
 			slices.SortFunc(e, func(v, w *Value) int {
+				_, idxMem, _, _ := isMemUser(v)
 				for i, a := range v.Args {
-					b := w.Args[i]
-					if valueEqClass[a.ID] < valueEqClass[b.ID] {
+					var aId, bId ID
+					if i != idxMem {
+						b := w.Args[i]
+						aId = a.ID
+						bId = b.ID
+					} else {
+						// A memory user's mem argument may be remapped to allow matching
+						// identical load-like instructions across disjoint stores.
+						aId, _ = getEffectiveMemoryArg(memTable, v)
+						bId, _ = getEffectiveMemoryArg(memTable, w)
+					}
+					if valueEqClass[aId] < valueEqClass[bId] {
 						return -1
 					}
-					if valueEqClass[a.ID] > valueEqClass[b.ID] {
+					if valueEqClass[aId] > valueEqClass[bId] {
 						return +1
 					}
 				}
@@ -126,12 +142,23 @@ func cse(f *Func) {
 				v, w := e[j-1], e[j]
 				// Note: commutative args already correctly ordered by byArgClass.
 				eqArgs := true
+				_, idxMem, _, _ := isMemUser(v)
 				for k, a := range v.Args {
 					if v.Op == OpLocalAddr && k == 1 {
 						continue
 					}
-					b := w.Args[k]
-					if valueEqClass[a.ID] != valueEqClass[b.ID] {
+					var aId, bId ID
+					if k != idxMem {
+						b := w.Args[k]
+						aId = a.ID
+						bId = b.ID
+					} else {
+						// A memory user's mem argument may be remapped to allow matching
+						// identical load-like instructions across disjoint stores.
+						aId, _ = getEffectiveMemoryArg(memTable, v)
+						bId, _ = getEffectiveMemoryArg(memTable, w)
+					}
+					if valueEqClass[aId] != valueEqClass[bId] {
 						eqArgs = false
 						break
 					}
 				}
@@ -180,10 +207,19 @@ func cse(f *Func) {
 	defer f.Cache.freeValueSlice(rewrite)
 	for _, e := range partition {
 		slices.SortFunc(e, func(v, w *Value) int {
-			c := cmp.Compare(sdom.domorder(v.Block), sdom.domorder(w.Block))
-			if c != 0 {
+			if c := cmp.Compare(sdom.domorder(v.Block), sdom.domorder(w.Block)); c != 0 {
 				return c
 			}
+			if _, _, _, ok := isMemUser(v); ok {
+				// Additional ordering among the memory users within one block: prefer the earliest
+				// possible value among the set of equivalent values, that is, the one with the lowest
+				// skip count (lowest number of memory defs skipped until their common def).
+				_, vSkips := getEffectiveMemoryArg(memTable, v)
+				_, wSkips := getEffectiveMemoryArg(memTable, w)
+				if c := cmp.Compare(vSkips, wSkips); c != 0 {
+					return c
+				}
+			}
 			if v.Op == OpLocalAddr {
 				// compare the memory args for OpLocalAddrs in the same block
 				vm := v.Args[1]
@@ -254,7 +290,7 @@ func cse(f *Func) {
 		for _, v := range b.Values {
 			for i, w := range v.Args {
 				if x := rewrite[w.ID]; x != nil {
-					if w.Pos.IsStmt() == src.PosIsStmt {
+					if w.Pos.IsStmt() == src.PosIsStmt && w.Op != OpNilCheck {
 						// about to lose a statement marker, w
 						// w is an input to v; if they're in the same block
 						// and the same line, v is a good-enough new statement boundary.
@@ -420,3 +456,82 @@ func cmpVal(v, w *Value, auxIDs auxmap) types.Cmp {
 	return types.CMPeq
 }
+
+// Query if the given instruction only uses a "memory" argument and we may try to skip some memory "defs" if they do not alias with its address.
+// Return index of pointer argument, index of "memory" argument, the access width and true on such instructions, otherwise return (-1, -1, 0, false).
+func isMemUser(v *Value) (int, int, int64, bool) {
+	switch v.Op {
+	case OpLoad:
+		return 0, 1, v.Type.Size(), true
+	case OpNilCheck:
+		return 0, 1, 0, true
+	default:
+		return -1, -1, 0, false
+	}
+}
+
+// Query if the given "memory"-defining instruction's memory destination can be analyzed for aliasing with memory "user" instructions.
+// Return index of pointer argument, index of "memory" argument, the access width and true on such instructions, otherwise return (-1, -1, 0, false).
+func isMemDef(v *Value) (int, int, int64, bool) {
+	switch v.Op {
+	case OpStore:
+		return 0, 2, auxToType(v.Aux).Size(), true
+	default:
+		return -1, -1, 0, false
+	}
+}
+
+// The mem table keeps memTableSkipBits lower bits to store the number of skips of the "memory" operand
+// and the rest to store the ID of the destination "memory"-producing instruction.
+const memTableSkipBits = 8
+
+// The maximum ID value we are able to store in the memTable; otherwise fall back to v.ID.
+const maxId = ID(1<<(31-memTableSkipBits)) - 1
+
+// Return the first possibly-aliased store along the memory chain starting at v's memory argument and the number of not-aliased stores skipped.
+func getEffectiveMemoryArg(memTable []int32, v *Value) (ID, uint32) {
+	if code := uint32(memTable[v.ID]); code != 0 {
+		return ID(code >> memTableSkipBits), code & ((1 << memTableSkipBits) - 1)
+	}
+	if idxPtr, idxMem, width, ok := isMemUser(v); ok {
+		// TODO: We could early return some predefined value if width==0
+		memId := v.Args[idxMem].ID
+		if memId > maxId {
+			return memId, 0
+		}
+		mem, skips := skipDisjointMemDefs(v, idxPtr, idxMem, width)
+		if mem.ID <= maxId {
+			memId = mem.ID
+		} else {
+			skips = 0 // avoid the skip
+		}
+		memTable[v.ID] = int32(memId<<memTableSkipBits | ID(skips))
+		return memId, skips
+	}
+	// Not a supported memory user; nothing to remap.
+	return 0, 0
+}
+
+// Walk the memory chain starting at v's memory argument, skipping memory defs whose destination
+// is disjoint from the useWidth bytes accessed through v's pointer argument.
+// Return the first possibly-aliasing memory def and the number of defs skipped.
+func skipDisjointMemDefs(v *Value, idxPtr, idxMem int, useWidth int64) (*Value, uint32) {
+	usePtr := v.Args[idxPtr]
+	mem := v.Args[idxMem]
+	skips := uint32(0)
+	for ; skips < (1<<memTableSkipBits)-1; skips++ {
+		if idxPtr, idxMem, width, ok := isMemDef(mem); ok {
+			if mem.Uses > 50 {
+				// Skipping a memory def with a lot of uses may potentially increase register pressure.
+				break
+			}
+			defPtr := mem.Args[idxPtr]
+			if disjoint(defPtr, width, usePtr, useWidth) {
+				mem = mem.Args[idxMem]
+				continue
+			}
+		}
+		break
+	}
+	return mem, skips
+}
diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go
index 4b13d65618..04989d93c1 100644
--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@@ -944,7 +944,8 @@ func disjointTypes(t1 *types.Type, t2 *types.Type) bool {
 	}
 
 	if !t1.IsPtr() || !t2.IsPtr() {
-		panic("disjointTypes: one of arguments is not a pointer")
+		// Treat non-pointer types (such as TFUNC, TMAP, uintptr) conservatively.
+		return false
 	}
 
 	t1 = t1.Elem()
diff --git a/test/prove.go b/test/prove.go
index e12b6087d3..1ddbc7b3f5 100644
--- a/test/prove.go
+++ b/test/prove.go
@@ -459,6 +459,24 @@ func f14(p, q *int, a []int) {
 	useInt(a[i2+j]) // ERROR "Proved IsInBounds$"
 }
 
+func f14mem(q *int, a []int) (r int) {
+	p := &r
+	i1 := *q
+	*p = 1 // CSE of the "q" pointer load across disjoint store to "p"
+	i2 := *q
+	useInt(a[i1])
+	useInt(a[i2]) // ERROR "Proved IsInBounds$"
+	return r
+}
+
+func sliceptr(a *[]int, i int) int {
+	var x, y int
+	px, py := &x, &y
+	*px = (*a)[i]
+	*py = (*a)[i] // ERROR "Proved IsInBounds$"
+	return x + y
+}
+
 func f15(s []int, x int) {
 	useSlice(s[x:])
 	useSlice(s[:x]) // ERROR "Proved IsSliceInBounds$"
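
Editor's note on the memTable encoding: each cached entry packs the remapped
memory def's ID into the upper bits of an int32 and the number of skipped
disjoint stores into the low memTableSkipBits bits; a zero entry doubles as
"not yet computed", and values whose ID exceeds maxId are simply not cached.
The following standalone Go sketch illustrates that packing; the helper names
encodeMemEntry/decodeMemEntry are hypothetical and do not exist in cmd/compile,
which performs this logic inline on memTable entries.

	// Standalone illustration only; encodeMemEntry/decodeMemEntry are hypothetical
	// names that mirror the packing cse.go performs inline on memTable entries.
	package main

	import "fmt"

	const (
		skipBits = 8                    // plays the role of memTableSkipBits
		maxID    = 1<<(31-skipBits) - 1 // plays the role of maxId; larger IDs are not cached
		maxSkips = 1<<skipBits - 1      // the skip counter saturates here
	)

	// encodeMemEntry packs a memory def ID and a skip count into one non-negative int32.
	func encodeMemEntry(memID int32, skips uint32) (int32, bool) {
		if memID > maxID || skips > maxSkips {
			return 0, false // does not fit; the compiler falls back to the original memory arg
		}
		return memID<<skipBits | int32(skips), true
	}

	// decodeMemEntry is the inverse of encodeMemEntry.
	func decodeMemEntry(code int32) (memID int32, skips uint32) {
		u := uint32(code)
		return int32(u >> skipBits), u & maxSkips
	}

	func main() {
		code, ok := encodeMemEntry(1234, 3) // def ID 1234 reached after skipping 3 disjoint stores
		fmt.Println(ok, code)               // true 315907
		id, skips := decodeMemEntry(code)
		fmt.Println(id, skips) // 1234 3
	}

Keeping the skip count alongside the ID is what lets the rewrite phase prefer,
among equivalent loads in one block, the one that skipped the fewest stores.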