"cmd/internal/obj"
)
+// maxShadowRanges bounds the number of disjoint byte intervals tracked
+// per shadowed pointer, to avoid quadratic behavior.
+const maxShadowRanges = 64
+
// dse does dead-store elimination on the Function.
// Dead stores are those which are unconditionally followed by
// another store to the same location, with no intervening load.
defer f.retSparseMap(shadowed)
// localAddrs maps from a local variable (the Aux field of a LocalAddr value) to an instance of a LocalAddr value for that variable in the current block.
localAddrs := map[any]*Value{}
+
+	// shadowedRanges stores the actual range data. The 'shadowed' sparseMap
+	// stores a 1-based index into this slice.
+ var shadowedRanges []*shadowRanges
+
for _, b := range f.Blocks {
// Find all the stores in this block. Categorize their uses:
// loadUse contains stores which are used by a subsequent load.
// Walk backwards looking for dead stores. Keep track of shadowed addresses.
// A "shadowed address" is a pointer, offset, and size describing a memory region that
// is known to be written. We keep track of shadowed addresses in the shadowed map,
- // mapping the ID of the address to a shadowRange where future writes will happen.
+ // mapping the ID of the address to a shadowRanges where future writes will happen.
// Since we're walking backwards, writes to a shadowed region are useless,
// as they will be immediately overwritten.
shadowed.clear()
+ shadowedRanges = shadowedRanges[:0]
v := last
walkloop:
// Someone might be reading this memory state.
// Clear all shadowed addresses.
shadowed.clear()
+ shadowedRanges = shadowedRanges[:0]
}
if v.Op == OpStore || v.Op == OpZero {
ptr := v.Args[0]
ptr = la
}
}
- srNum, _ := shadowed.get(ptr.ID)
- sr := shadowRange(srNum)
- if sr.contains(off, off+sz) {
+ var si *shadowRanges
+ idx, ok := shadowed.get(ptr.ID)
+ if ok {
+ // The sparseMap stores a 1-based index, so we subtract 1.
+ si = shadowedRanges[idx-1]
+ }
+
+ if si != nil && si.contains(off, off+sz) {
// Modify the store/zero into a copy of the memory state,
// effectively eliding the store operation.
if v.Op == OpStore {
v.Op = OpCopy
} else {
// Extend shadowed region.
- shadowed.set(ptr.ID, int32(sr.merge(off, off+sz)))
+ if si == nil {
+ si = &shadowRanges{}
+ shadowedRanges = append(shadowedRanges, si)
+ // Store a 1-based index in the sparseMap.
+ shadowed.set(ptr.ID, int32(len(shadowedRanges)))
+ }
+ si.add(off, off+sz)
}
}
// walk to previous store
}
}
-// A shadowRange encodes a set of byte offsets [lo():hi()] from
-// a given pointer that will be written to later in the block.
-// A zero shadowRange encodes an empty shadowed range.
-type shadowRange int32
-
-func (sr shadowRange) lo() int64 {
- return int64(sr & 0xffff)
+// shadowRange is a single half-open byte interval [lo, hi) that will be
+// written later in the block.
+type shadowRange struct {
+ lo, hi uint16
}
-func (sr shadowRange) hi() int64 {
- return int64((sr >> 16) & 0xffff)
+// shadowRanges is an unordered collection of disjoint byte intervals that
+// will be written later in the block.
+type shadowRanges struct {
+ ranges []shadowRange
}
// contains reports whether [lo:hi] is completely within sr.
-func (sr shadowRange) contains(lo, hi int64) bool {
- return lo >= sr.lo() && hi <= sr.hi()
+func (sr *shadowRanges) contains(lo, hi int64) bool {
+ for _, r := range sr.ranges {
+ if lo >= int64(r.lo) && hi <= int64(r.hi) {
+ return true
+ }
+ }
+ return false
}
-// merge returns the union of sr and [lo:hi].
-// merge is allowed to return something smaller than the union.
-func (sr shadowRange) merge(lo, hi int64) shadowRange {
- if lo < 0 || hi > 0xffff {
- // Ignore offsets that are too large or small.
- return sr
- }
- if sr.lo() == sr.hi() {
- // Old range is empty - use new one.
- return shadowRange(lo + hi<<16)
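+// add records that the byte range [lo:hi] will be written, folding the new
+// interval into any existing ranges it overlaps or abuts.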
+func (sr *shadowRanges) add(lo, hi int64) {
+	// Drop ranges that don't fit in 16 bits, and stop adding once
+	// maxShadowRanges intervals are tracked. Dropping a range is safe:
+	// it only means some dead stores may go undetected. The cap avoids
+	// quadratic behavior on blocks with many disjoint writes.
+ if lo < 0 || hi > 0xffff || len(sr.ranges) >= maxShadowRanges {
+ return
}
- if hi < sr.lo() || lo > sr.hi() {
- // The two regions don't overlap or abut, so we would
- // have to keep track of multiple disjoint ranges.
- // Because we can only keep one, keep the larger one.
- if sr.hi()-sr.lo() >= hi-lo {
- return sr
+ nlo := lo
+ nhi := hi
+ out := sr.ranges[:0]
+
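+	// Filter sr.ranges in place: keep intervals disjoint from the new one,
+	// and grow [nlo:nhi] to absorb any interval it overlaps or abuts.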
+ for _, r := range sr.ranges {
+ if nhi < int64(r.lo) || nlo > int64(r.hi) {
+ out = append(out, r)
+ continue
+ }
+ if int64(r.lo) < nlo {
+ nlo = int64(r.lo)
+ }
+ if int64(r.hi) > nhi {
+ nhi = int64(r.hi)
}
- return shadowRange(lo + hi<<16)
}
- // Regions overlap or abut - compute the union.
- return shadowRange(min(lo, sr.lo()) + max(hi, sr.hi())<<16)
+ sr.ranges = append(out, shadowRange{uint16(nlo), uint16(nhi)})
}
// elimDeadAutosGeneric deletes autos that are never accessed. To achieve this
import (
"cmd/compile/internal/types"
"cmd/internal/src"
+ "fmt"
+ "sort"
"testing"
)
t.Errorf("dead store not removed")
}
}
+
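+// TestDeadStoreArrayGap checks that a Zero of a 5-element int64 array is
+// eliminated when later stores overwrite all 40 bytes, even though the stores
+// are issued out of order and temporarily leave a gap in the covered range.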
+func TestDeadStoreArrayGap(t *testing.T) {
+ c := testConfig(t)
+ ptr := c.config.Types.BytePtr
+ i64 := c.config.Types.Int64
+
+ typ := types.NewArray(i64, 5)
+ tmp := c.Temp(typ)
+
+ fun := c.Fun("entry",
+ Bloc("entry",
+ Valu("start", OpInitMem, types.TypeMem, 0, nil),
+ Valu("sp", OpSP, c.config.Types.Uintptr, 0, nil),
+
+ Valu("base", OpLocalAddr, ptr, 0, tmp, "sp", "start"),
+
+ Valu("p0", OpOffPtr, ptr, 0, nil, "base"),
+ Valu("p1", OpOffPtr, ptr, 8, nil, "base"),
+ Valu("p2", OpOffPtr, ptr, 16, nil, "base"),
+ Valu("p3", OpOffPtr, ptr, 24, nil, "base"),
+ Valu("p4", OpOffPtr, ptr, 32, nil, "base"),
+
+ Valu("one", OpConst64, i64, 1, nil),
+ Valu("seven", OpConst64, i64, 7, nil),
+ Valu("zero", OpConst64, i64, 0, nil),
+
+ Valu("mem0", OpZero, types.TypeMem, 40, typ, "base", "start"),
+
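+			// Overwrite all five elements, storing to element 2 (bytes 16-24) last.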
+ Valu("s0", OpStore, types.TypeMem, 0, i64, "p0", "one", "mem0"),
+ Valu("s1", OpStore, types.TypeMem, 0, i64, "p1", "seven", "s0"),
+ Valu("s2", OpStore, types.TypeMem, 0, i64, "p3", "one", "s1"),
+ Valu("s3", OpStore, types.TypeMem, 0, i64, "p4", "one", "s2"),
+ Valu("s4", OpStore, types.TypeMem, 0, i64, "p2", "zero", "s3"),
+
+ Goto("exit")),
+ Bloc("exit",
+ Exit("s4")))
+
+ CheckFunc(fun.f)
+ dse(fun.f)
+ CheckFunc(fun.f)
+
+ if op := fun.values["mem0"].Op; op != OpCopy {
+ t.Fatalf("dead Zero not removed: got %s, want OpCopy", op)
+ }
+}
+
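+// TestShadowRanges exercises shadowRanges.add and contains directly:
+// merging of overlapping and abutting intervals, disjoint intervals, and
+// out-of-range offsets.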
+func TestShadowRanges(t *testing.T) {
+ t.Run("simple insert & contains", func(t *testing.T) {
+ var sr shadowRanges
+ sr.add(10, 20)
+
+ wantRanges(t, sr.ranges, [][2]uint16{{10, 20}})
+ if !sr.contains(12, 18) || !sr.contains(10, 20) {
+ t.Fatalf("contains failed after simple add")
+ }
+ if sr.contains(9, 11) || sr.contains(11, 21) {
+ t.Fatalf("contains erroneously true for non-contained range")
+ }
+ })
+
+ t.Run("merge overlapping", func(t *testing.T) {
+ var sr shadowRanges
+ sr.add(10, 20)
+ sr.add(15, 25)
+
+ wantRanges(t, sr.ranges, [][2]uint16{{10, 25}})
+ if !sr.contains(13, 24) {
+ t.Fatalf("contains should be true after merge")
+ }
+ })
+
+ t.Run("merge touching boundary", func(t *testing.T) {
+ var sr shadowRanges
+ sr.add(100, 150)
+ // touches at 150 - should coalesce
+ sr.add(150, 180)
+
+ wantRanges(t, sr.ranges, [][2]uint16{{100, 180}})
+ })
+
+ t.Run("union across several ranges", func(t *testing.T) {
+ var sr shadowRanges
+ sr.add(10, 20)
+ sr.add(30, 40)
+ // bridges second, not first
+ sr.add(25, 35)
+
+ wantRanges(t, sr.ranges, [][2]uint16{{10, 20}, {25, 40}})
+
+ // envelops everything
+ sr.add(5, 50)
+ wantRanges(t, sr.ranges, [][2]uint16{{5, 50}})
+ })
+
+ t.Run("disjoint intervals stay separate", func(t *testing.T) {
+ var sr shadowRanges
+ sr.add(10, 20)
+ sr.add(22, 30)
+
+ wantRanges(t, sr.ranges, [][2]uint16{{10, 20}, {22, 30}})
+ // spans both
+ if sr.contains(15, 25) {
+ t.Fatalf("contains across two disjoint ranges should be false")
+ }
+ })
+
+ t.Run("large uint16 offsets still work", func(t *testing.T) {
+ var sr shadowRanges
+ sr.add(40000, 45000)
+
+ if !sr.contains(42000, 43000) {
+ t.Fatalf("contains failed for large uint16 values")
+ }
+ })
+
+ t.Run("out-of-bounds inserts ignored", func(t *testing.T) {
+ var sr shadowRanges
+ sr.add(10, 20)
+ sr.add(-5, 5)
+ sr.add(70000, 70010)
+
+ wantRanges(t, sr.ranges, [][2]uint16{{10, 20}})
+ })
+}
+
+// sortRanges sorts ranges by lo so that comparisons are order-independent.
+func sortRanges(r []shadowRange) {
+ sort.Slice(r, func(i, j int) bool { return r[i].lo < r[j].lo })
+}
+
+// wantRanges checks that got, ignoring order, matches the expected
+// (lo, hi) pairs.
+func wantRanges(t *testing.T, got []shadowRange, want [][2]uint16) {
+ t.Helper()
+ sortRanges(got)
+
+ if len(got) != len(want) {
+ t.Fatalf("len(ranges)=%d, want %d (got=%v)", len(got), len(want), got)
+ }
+
+ for i, w := range want {
+ if got[i].lo != w[0] || got[i].hi != w[1] {
+ t.Fatalf("range %d = [%d,%d], want [%d,%d] (full=%v)",
+ i, got[i].lo, got[i].hi, w[0], w[1], got)
+ }
+ }
+}
+
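+// BenchmarkDeadStore measures dse on a small straight-line block in which a
+// Zero and a store are shadowed by later stores to the same addresses.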
+func BenchmarkDeadStore(b *testing.B) {
+ cfg := testConfig(b)
+ ptr := cfg.config.Types.BytePtr
+
+ f := cfg.Fun("entry",
+ Bloc("entry",
+ Valu("start", OpInitMem, types.TypeMem, 0, nil),
+ Valu("sb", OpSB, cfg.config.Types.Uintptr, 0, nil),
+ Valu("v", OpConstBool, cfg.config.Types.Bool, 1, nil),
+ Valu("a1", OpAddr, ptr, 0, nil, "sb"),
+ Valu("a2", OpAddr, ptr, 0, nil, "sb"),
+ Valu("a3", OpAddr, ptr, 0, nil, "sb"),
+ Valu("z1", OpZero, types.TypeMem, 1, cfg.config.Types.Bool, "a3", "start"),
+ Valu("s1", OpStore, types.TypeMem, 0, cfg.config.Types.Bool, "a1", "v", "z1"),
+ Valu("s2", OpStore, types.TypeMem, 0, cfg.config.Types.Bool, "a2", "v", "s1"),
+ Valu("s3", OpStore, types.TypeMem, 0, cfg.config.Types.Bool, "a1", "v", "s2"),
+ Valu("s4", OpStore, types.TypeMem, 0, cfg.config.Types.Bool, "a3", "v", "s3"),
+ Goto("exit")),
+ Bloc("exit",
+ Exit("s3")))
+
+ runBench(b, func() {
+ dse(f.f)
+ })
+}
+
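+// BenchmarkDeadStorePhi measures dse on a loop whose store takes its memory
+// from a Phi, so no store can be eliminated.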
+func BenchmarkDeadStorePhi(b *testing.B) {
+ cfg := testConfig(b)
+ ptr := cfg.config.Types.BytePtr
+
+ f := cfg.Fun("entry",
+ Bloc("entry",
+ Valu("start", OpInitMem, types.TypeMem, 0, nil),
+ Valu("sb", OpSB, cfg.config.Types.Uintptr, 0, nil),
+ Valu("v", OpConstBool, cfg.config.Types.Bool, 1, nil),
+ Valu("addr", OpAddr, ptr, 0, nil, "sb"),
+ Goto("loop")),
+ Bloc("loop",
+ Valu("phi", OpPhi, types.TypeMem, 0, nil, "start", "store"),
+ Valu("store", OpStore, types.TypeMem, 0, cfg.config.Types.Bool, "addr", "v", "phi"),
+ If("v", "loop", "exit")),
+ Bloc("exit",
+ Exit("store")))
+
+ runBench(b, func() {
+ dse(f.f)
+ })
+}
+
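+// BenchmarkDeadStoreTypes measures dse (after cse) on stores made through two
+// addresses with distinct pointer types for the same symbol.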
+func BenchmarkDeadStoreTypes(b *testing.B) {
+ cfg := testConfig(b)
+
+ t1 := cfg.config.Types.UInt64.PtrTo()
+ t2 := cfg.config.Types.UInt32.PtrTo()
+
+ f := cfg.Fun("entry",
+ Bloc("entry",
+ Valu("start", OpInitMem, types.TypeMem, 0, nil),
+ Valu("sb", OpSB, cfg.config.Types.Uintptr, 0, nil),
+ Valu("v", OpConstBool, cfg.config.Types.Bool, 1, nil),
+ Valu("a1", OpAddr, t1, 0, nil, "sb"),
+ Valu("a2", OpAddr, t2, 0, nil, "sb"),
+ Valu("s1", OpStore, types.TypeMem, 0, cfg.config.Types.Bool, "a1", "v", "start"),
+ Valu("s2", OpStore, types.TypeMem, 0, cfg.config.Types.Bool, "a2", "v", "s1"),
+ Goto("exit")),
+ Bloc("exit",
+ Exit("s2")))
+ cse(f.f)
+
+ runBench(b, func() {
+ dse(f.f)
+ })
+}
+
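+// BenchmarkDeadStoreUnsafe measures dse (after cse) on an 8-byte store
+// followed by a 1-byte store to the same address; the narrow store must not
+// shadow the wide one.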
+func BenchmarkDeadStoreUnsafe(b *testing.B) {
+ cfg := testConfig(b)
+ ptr := cfg.config.Types.UInt64.PtrTo()
+ f := cfg.Fun("entry",
+ Bloc("entry",
+ Valu("start", OpInitMem, types.TypeMem, 0, nil),
+ Valu("sb", OpSB, cfg.config.Types.Uintptr, 0, nil),
+ Valu("v", OpConstBool, cfg.config.Types.Bool, 1, nil),
+ Valu("a1", OpAddr, ptr, 0, nil, "sb"),
+ Valu("s1", OpStore, types.TypeMem, 0, cfg.config.Types.Int64, "a1", "v", "start"),
+ Valu("s2", OpStore, types.TypeMem, 0, cfg.config.Types.Bool, "a1", "v", "s1"),
+ Goto("exit")),
+ Bloc("exit",
+ Exit("s2")))
+ cse(f.f)
+ runBench(b, func() {
+ dse(f.f)
+ })
+}
+
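+// BenchmarkDeadStoreSmallStructInit measures dse on a two-field struct temp
+// that is initialized twice through separate LocalAddr values, making the
+// first pair of stores dead.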
+func BenchmarkDeadStoreSmallStructInit(b *testing.B) {
+ cfg := testConfig(b)
+ ptr := cfg.config.Types.BytePtr
+
+ typ := types.NewStruct([]*types.Field{
+ types.NewField(src.NoXPos, &types.Sym{Name: "A"}, cfg.config.Types.Int),
+ types.NewField(src.NoXPos, &types.Sym{Name: "B"}, cfg.config.Types.Int),
+ })
+ tmp := cfg.Temp(typ)
+
+ f := cfg.Fun("entry",
+ Bloc("entry",
+ Valu("start", OpInitMem, types.TypeMem, 0, nil),
+ Valu("sp", OpSP, cfg.config.Types.Uintptr, 0, nil),
+ Valu("zero", OpConst64, cfg.config.Types.Int, 0, nil),
+
+ Valu("v6", OpLocalAddr, ptr, 0, tmp, "sp", "start"),
+ Valu("v3", OpOffPtr, ptr, 8, nil, "v6"),
+ Valu("v22", OpOffPtr, ptr, 0, nil, "v6"),
+ Valu("s1", OpStore, types.TypeMem, 0, cfg.config.Types.Int, "v22", "zero", "start"),
+ Valu("s2", OpStore, types.TypeMem, 0, cfg.config.Types.Int, "v3", "zero", "s1"),
+
+ Valu("v8", OpLocalAddr, ptr, 0, tmp, "sp", "s2"),
+ Valu("v23", OpOffPtr, ptr, 8, nil, "v8"),
+ Valu("v25", OpOffPtr, ptr, 0, nil, "v8"),
+ Valu("s3", OpStore, types.TypeMem, 0, cfg.config.Types.Int, "v25", "zero", "s2"),
+ Valu("s4", OpStore, types.TypeMem, 0, cfg.config.Types.Int, "v23", "zero", "s3"),
+ Goto("exit")),
+ Bloc("exit",
+ Exit("s4")))
+ cse(f.f)
+
+ runBench(b, func() {
+ dse(f.f)
+ })
+}
+
+func BenchmarkDeadStoreLargeBlock(b *testing.B) {
+	// Build one very large block: addrCount base addresses, each written
+	// storesPerAddr times, so all but the last store to each address are dead.
+	const (
+		addrCount     = 128
+		storesPerAddr = 8
+ )
+ cfg := testConfig(b)
+ ptrType := cfg.config.Types.BytePtr
+ boolType := cfg.config.Types.Bool
+
+ items := []interface{}{
+ Valu("start", OpInitMem, types.TypeMem, 0, nil),
+ Valu("sb", OpSB, cfg.config.Types.Uintptr, 0, nil),
+ Valu("v", OpConstBool, boolType, 1, nil),
+ }
+
+ for i := 0; i < addrCount; i++ {
+ items = append(items,
+ Valu(fmt.Sprintf("addr%d", i), OpAddr, ptrType, 0, nil, "sb"),
+ )
+ }
+
+ prev := "start"
+ for round := 0; round < storesPerAddr; round++ {
+ for i := 0; i < addrCount; i++ {
+ store := fmt.Sprintf("s_%03d_%d", i, round)
+ addr := fmt.Sprintf("addr%d", i)
+ items = append(items,
+ Valu(store, OpStore, types.TypeMem, 0, boolType, addr, "v", prev),
+ )
+ prev = store
+ }
+ }
+
+ items = append(items, Goto("exit"))
+ entryBlk := Bloc("entry", items...)
+ exitBlk := Bloc("exit", Exit(prev))
+
+ f := cfg.Fun("stress", entryBlk, exitBlk)
+
+ runBench(b, func() {
+ dse(f.f)
+ })
+}
+
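+// runBench reports allocations and invokes the given function b.N times.
+// dse rewrites the Func in place, so iterations after the first operate on
+// the already-optimized function.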
+func runBench(b *testing.B, fn func()) {
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		fn()
+	}
+}