From f0e281e693685954df71374c9a9fb856e8745519 Mon Sep 17 00:00:00 2001 From: Cherry Mui Date: Fri, 26 Sep 2025 14:38:22 -0400 Subject: [PATCH] [dev.simd] cmd/compile: don't require single use for SIMD load/store folding For load and store on scalar values, we fold the address into the load/store instruction without requiring that the address have only one use. Do the same for SIMD, and remove the single use check. Change-Id: Ie7d1bbae1b32bb8c069548197632edae36b419b9 Reviewed-on: https://go-review.googlesource.com/c/go/+/707137 Reviewed-by: Junyang Shao Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI --- src/cmd/compile/internal/ssa/_gen/AMD64.rules | 8 ++-- src/cmd/compile/internal/ssa/rewriteAMD64.go | 48 +++++++++---------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules index 05fc64d486..3689c12411 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules @@ -1798,10 +1798,10 @@ (VMOVSDf2v x:(MOVSDconst [c] )) => (VMOVSDconst [c] ) (VMOVSSf2v x:(MOVSSconst [c] )) => (VMOVSSconst [c] ) -(VMOVDQUload(128|256|512) [off1] {sym} x:(ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 => (VMOVDQUload(128|256|512) [off1+off2] {sym} ptr mem) -(VMOVDQUstore(128|256|512) [off1] {sym} x:(ADDQconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 => (VMOVDQUstore(128|256|512) [off1+off2] {sym} ptr val mem) -(VMOVDQUload(128|256|512) [off1] {sym1} x:(LEAQ [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) => (VMOVDQUload(128|256|512) [off1+off2] {mergeSym(sym1, sym2)} base mem) -(VMOVDQUstore(128|256|512) [off1] {sym1} x:(LEAQ [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) => (VMOVDQUstore(128|256|512) [off1+off2] {mergeSym(sym1, sym2)} base val mem) 
+(VMOVDQUload(128|256|512) [off1] {sym} x:(ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) => (VMOVDQUload(128|256|512) [off1+off2] {sym} ptr mem) +(VMOVDQUstore(128|256|512) [off1] {sym} x:(ADDQconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) => (VMOVDQUstore(128|256|512) [off1+off2] {sym} ptr val mem) +(VMOVDQUload(128|256|512) [off1] {sym1} x:(LEAQ [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => (VMOVDQUload(128|256|512) [off1+off2] {mergeSym(sym1, sym2)} base mem) +(VMOVDQUstore(128|256|512) [off1] {sym1} x:(LEAQ [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => (VMOVDQUstore(128|256|512) [off1+off2] {mergeSym(sym1, sym2)} base val mem) // 2-op VPTEST optimizations (SETEQ (VPTEST x:(VPAND(128|256) j k) y)) && x == y && x.Uses == 2 => (SETEQ (VPTEST j k)) diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 747b337192..ca9f9ae17b 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -33295,7 +33295,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUload128(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] // match: (VMOVDQUload128 [off1] {sym} x:(ADDQconst [off2] ptr) mem) - // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 + // cond: is32Bit(int64(off1)+int64(off2)) // result: (VMOVDQUload128 [off1+off2] {sym} ptr mem) for { off1 := auxIntToInt32(v.AuxInt) @@ -33307,7 +33307,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUload128(v *Value) bool { off2 := auxIntToInt32(x.AuxInt) ptr := x.Args[0] mem := v_1 - if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1) { + if !(is32Bit(int64(off1) + int64(off2))) { break } v.reset(OpAMD64VMOVDQUload128) @@ -33317,7 +33317,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUload128(v *Value) bool { return true } // match: (VMOVDQUload128 [off1] {sym1} x:(LEAQ [off2] {sym2} base) mem) - // cond: 
is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) + // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) // result: (VMOVDQUload128 [off1+off2] {mergeSym(sym1, sym2)} base mem) for { off1 := auxIntToInt32(v.AuxInt) @@ -33330,7 +33330,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUload128(v *Value) bool { sym2 := auxToSym(x.Aux) base := x.Args[0] mem := v_1 - if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2)) { + if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { break } v.reset(OpAMD64VMOVDQUload128) @@ -33345,7 +33345,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUload256(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] // match: (VMOVDQUload256 [off1] {sym} x:(ADDQconst [off2] ptr) mem) - // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 + // cond: is32Bit(int64(off1)+int64(off2)) // result: (VMOVDQUload256 [off1+off2] {sym} ptr mem) for { off1 := auxIntToInt32(v.AuxInt) @@ -33357,7 +33357,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUload256(v *Value) bool { off2 := auxIntToInt32(x.AuxInt) ptr := x.Args[0] mem := v_1 - if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1) { + if !(is32Bit(int64(off1) + int64(off2))) { break } v.reset(OpAMD64VMOVDQUload256) @@ -33367,7 +33367,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUload256(v *Value) bool { return true } // match: (VMOVDQUload256 [off1] {sym1} x:(LEAQ [off2] {sym2} base) mem) - // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) + // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) // result: (VMOVDQUload256 [off1+off2] {mergeSym(sym1, sym2)} base mem) for { off1 := auxIntToInt32(v.AuxInt) @@ -33380,7 +33380,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUload256(v *Value) bool { sym2 := auxToSym(x.Aux) base := x.Args[0] mem := v_1 - if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2)) { + if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { 
break } v.reset(OpAMD64VMOVDQUload256) @@ -33395,7 +33395,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUload512(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] // match: (VMOVDQUload512 [off1] {sym} x:(ADDQconst [off2] ptr) mem) - // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 + // cond: is32Bit(int64(off1)+int64(off2)) // result: (VMOVDQUload512 [off1+off2] {sym} ptr mem) for { off1 := auxIntToInt32(v.AuxInt) @@ -33407,7 +33407,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUload512(v *Value) bool { off2 := auxIntToInt32(x.AuxInt) ptr := x.Args[0] mem := v_1 - if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1) { + if !(is32Bit(int64(off1) + int64(off2))) { break } v.reset(OpAMD64VMOVDQUload512) @@ -33417,7 +33417,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUload512(v *Value) bool { return true } // match: (VMOVDQUload512 [off1] {sym1} x:(LEAQ [off2] {sym2} base) mem) - // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) + // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) // result: (VMOVDQUload512 [off1+off2] {mergeSym(sym1, sym2)} base mem) for { off1 := auxIntToInt32(v.AuxInt) @@ -33430,7 +33430,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUload512(v *Value) bool { sym2 := auxToSym(x.Aux) base := x.Args[0] mem := v_1 - if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2)) { + if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { break } v.reset(OpAMD64VMOVDQUload512) @@ -33446,7 +33446,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUstore128(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] // match: (VMOVDQUstore128 [off1] {sym} x:(ADDQconst [off2] ptr) val mem) - // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 + // cond: is32Bit(int64(off1)+int64(off2)) // result: (VMOVDQUstore128 [off1+off2] {sym} ptr val mem) for { off1 := auxIntToInt32(v.AuxInt) @@ -33459,7 +33459,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUstore128(v *Value) bool { ptr := x.Args[0] val := v_1 mem 
:= v_2 - if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1) { + if !(is32Bit(int64(off1) + int64(off2))) { break } v.reset(OpAMD64VMOVDQUstore128) @@ -33469,7 +33469,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUstore128(v *Value) bool { return true } // match: (VMOVDQUstore128 [off1] {sym1} x:(LEAQ [off2] {sym2} base) val mem) - // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) + // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) // result: (VMOVDQUstore128 [off1+off2] {mergeSym(sym1, sym2)} base val mem) for { off1 := auxIntToInt32(v.AuxInt) @@ -33483,7 +33483,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUstore128(v *Value) bool { base := x.Args[0] val := v_1 mem := v_2 - if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2)) { + if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { break } v.reset(OpAMD64VMOVDQUstore128) @@ -33499,7 +33499,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUstore256(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] // match: (VMOVDQUstore256 [off1] {sym} x:(ADDQconst [off2] ptr) val mem) - // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 + // cond: is32Bit(int64(off1)+int64(off2)) // result: (VMOVDQUstore256 [off1+off2] {sym} ptr val mem) for { off1 := auxIntToInt32(v.AuxInt) @@ -33512,7 +33512,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUstore256(v *Value) bool { ptr := x.Args[0] val := v_1 mem := v_2 - if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1) { + if !(is32Bit(int64(off1) + int64(off2))) { break } v.reset(OpAMD64VMOVDQUstore256) @@ -33522,7 +33522,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUstore256(v *Value) bool { return true } // match: (VMOVDQUstore256 [off1] {sym1} x:(LEAQ [off2] {sym2} base) val mem) - // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) + // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) // result: (VMOVDQUstore256 [off1+off2] {mergeSym(sym1, sym2)} base val mem) 
for { off1 := auxIntToInt32(v.AuxInt) @@ -33536,7 +33536,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUstore256(v *Value) bool { base := x.Args[0] val := v_1 mem := v_2 - if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2)) { + if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { break } v.reset(OpAMD64VMOVDQUstore256) @@ -33552,7 +33552,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUstore512(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] // match: (VMOVDQUstore512 [off1] {sym} x:(ADDQconst [off2] ptr) val mem) - // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 + // cond: is32Bit(int64(off1)+int64(off2)) // result: (VMOVDQUstore512 [off1+off2] {sym} ptr val mem) for { off1 := auxIntToInt32(v.AuxInt) @@ -33565,7 +33565,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUstore512(v *Value) bool { ptr := x.Args[0] val := v_1 mem := v_2 - if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1) { + if !(is32Bit(int64(off1) + int64(off2))) { break } v.reset(OpAMD64VMOVDQUstore512) @@ -33575,7 +33575,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUstore512(v *Value) bool { return true } // match: (VMOVDQUstore512 [off1] {sym1} x:(LEAQ [off2] {sym2} base) val mem) - // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) + // cond: is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) // result: (VMOVDQUstore512 [off1+off2] {mergeSym(sym1, sym2)} base val mem) for { off1 := auxIntToInt32(v.AuxInt) @@ -33589,7 +33589,7 @@ func rewriteValueAMD64_OpAMD64VMOVDQUstore512(v *Value) bool { base := x.Args[0] val := v_1 mem := v_2 - if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2)) { + if !(is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)) { break } v.reset(OpAMD64VMOVDQUstore512) -- 2.52.0