From 1e5631d4e0caddbf46ba61debb95fa9dce67ccbe Mon Sep 17 00:00:00 2001
From: Junyang Shao
Date: Thu, 11 Sep 2025 19:43:48 +0000
Subject: [PATCH] [dev.simd] cmd/compile: peephole simd load

Some convenient peepholes; they might not have a big impact on
performance.

Change-Id: I25574dba95fcf1d5fda14472175e556737b51584
Reviewed-on: https://go-review.googlesource.com/c/go/+/702997
Reviewed-by: Cherry Mui
LUCI-TryBot-Result: Go LUCI
---
 src/cmd/compile/internal/ssa/_gen/AMD64.rules |   4 +
 src/cmd/compile/internal/ssa/rewriteAMD64.go  | 321 ++++++++++++++++++
 2 files changed, 325 insertions(+)

diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
index 2300cc3757..ad84ba7555 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
@@ -1791,3 +1791,7 @@
 (VMOVSDf2v x:(MOVSDconst [c] )) => (VMOVSDconst [c] )
 (VMOVSSf2v x:(MOVSSconst [c] )) => (VMOVSSconst [c] )
+(VMOVDQUload(128|256|512) [off1] {sym} x:(ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 => (VMOVDQUload(128|256|512) [off1+off2] {sym} ptr mem)
+(VMOVDQUstore(128|256|512) [off1] {sym} x:(ADDQconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 => (VMOVDQUstore(128|256|512) [off1+off2] {sym} ptr val mem)
+(VMOVDQUload(128|256|512) [off1] {sym1} x:(LEAQ [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) => (VMOVDQUload(128|256|512) [off1+off2] {mergeSym(sym1, sym2)} base mem)
+(VMOVDQUstore(128|256|512) [off1] {sym1} x:(LEAQ [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) => (VMOVDQUstore(128|256|512) [off1+off2] {mergeSym(sym1, sym2)} base val mem)
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 737b0c4762..d705b92003 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -709,6 +709,18 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpAMD64VMOVDQU64Masked512(v)
 	case OpAMD64VMOVDQU8Masked512:
 		return rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v)
+	case OpAMD64VMOVDQUload128:
+		return rewriteValueAMD64_OpAMD64VMOVDQUload128(v)
+	case OpAMD64VMOVDQUload256:
+		return rewriteValueAMD64_OpAMD64VMOVDQUload256(v)
+	case OpAMD64VMOVDQUload512:
+		return rewriteValueAMD64_OpAMD64VMOVDQUload512(v)
+	case OpAMD64VMOVDQUstore128:
+		return rewriteValueAMD64_OpAMD64VMOVDQUstore128(v)
+	case OpAMD64VMOVDQUstore256:
+		return rewriteValueAMD64_OpAMD64VMOVDQUstore256(v)
+	case OpAMD64VMOVDQUstore512:
+		return rewriteValueAMD64_OpAMD64VMOVDQUstore512(v)
 	case OpAMD64VMOVQ:
 		return rewriteValueAMD64_OpAMD64VMOVQ(v)
 	case OpAMD64VMOVSDf2v:
@@ -32833,6 +32845,315 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v *Value) bool {
 	}
 	return false
 }
+func rewriteValueAMD64_OpAMD64VMOVDQUload128(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	// match: (VMOVDQUload128 [off1] {sym} x:(ADDQconst [off2] ptr) mem)
+	// cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1
+	// result: (VMOVDQUload128 [off1+off2] {sym} ptr mem)
+	for {
+		off1 := auxIntToInt32(v.AuxInt)
+		sym := auxToSym(v.Aux)
+		x := v_0
+		if x.Op != OpAMD64ADDQconst {
+			break
+		}
+		off2 := auxIntToInt32(x.AuxInt)
+		ptr := x.Args[0]
+		mem := v_1
+		if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1) {
+			break
+		}
+		v.reset(OpAMD64VMOVDQUload128)
+		v.AuxInt = int32ToAuxInt(off1 + off2)
+		v.Aux = 
symToAux(sym) + v.AddArg2(ptr, mem) + return true + } + // match: (VMOVDQUload128 [off1] {sym1} x:(LEAQ [off2] {sym2} base) mem) + // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) + // result: (VMOVDQUload128 [off1+off2] {mergeSym(sym1, sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym1 := auxToSym(v.Aux) + x := v_0 + if x.Op != OpAMD64LEAQ { + break + } + off2 := auxIntToInt32(x.AuxInt) + sym2 := auxToSym(x.Aux) + base := x.Args[0] + mem := v_1 + if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2)) { + break + } + v.reset(OpAMD64VMOVDQUload128) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(mergeSym(sym1, sym2)) + v.AddArg2(base, mem) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VMOVDQUload256(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VMOVDQUload256 [off1] {sym} x:(ADDQconst [off2] ptr) mem) + // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 + // result: (VMOVDQUload256 [off1+off2] {sym} ptr mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) + x := v_0 + if x.Op != OpAMD64ADDQconst { + break + } + off2 := auxIntToInt32(x.AuxInt) + ptr := x.Args[0] + mem := v_1 + if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1) { + break + } + v.reset(OpAMD64VMOVDQUload256) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(sym) + v.AddArg2(ptr, mem) + return true + } + // match: (VMOVDQUload256 [off1] {sym1} x:(LEAQ [off2] {sym2} base) mem) + // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) + // result: (VMOVDQUload256 [off1+off2] {mergeSym(sym1, sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym1 := auxToSym(v.Aux) + x := v_0 + if x.Op != OpAMD64LEAQ { + break + } + off2 := auxIntToInt32(x.AuxInt) + sym2 := auxToSym(x.Aux) + base := x.Args[0] + mem := v_1 + if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2)) { + break + } + v.reset(OpAMD64VMOVDQUload256) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(mergeSym(sym1, sym2)) + v.AddArg2(base, mem) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VMOVDQUload512(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VMOVDQUload512 [off1] {sym} x:(ADDQconst [off2] ptr) mem) + // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 + // result: (VMOVDQUload512 [off1+off2] {sym} ptr mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) + x := v_0 + if x.Op != OpAMD64ADDQconst { + break + } + off2 := auxIntToInt32(x.AuxInt) + ptr := x.Args[0] + mem := v_1 + if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1) { + break + } + v.reset(OpAMD64VMOVDQUload512) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(sym) + v.AddArg2(ptr, mem) + return true + } + // match: (VMOVDQUload512 [off1] {sym1} x:(LEAQ [off2] {sym2} base) mem) + // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) + // result: (VMOVDQUload512 [off1+off2] {mergeSym(sym1, sym2)} base mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym1 := auxToSym(v.Aux) + x := v_0 + if x.Op != OpAMD64LEAQ { + break + } + off2 := auxIntToInt32(x.AuxInt) + sym2 := auxToSym(x.Aux) + base := x.Args[0] + mem := v_1 + if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2)) { + break + } + v.reset(OpAMD64VMOVDQUload512) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(mergeSym(sym1, sym2)) + v.AddArg2(base, mem) + return true + } + 
return false +} +func rewriteValueAMD64_OpAMD64VMOVDQUstore128(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VMOVDQUstore128 [off1] {sym} x:(ADDQconst [off2] ptr) val mem) + // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 + // result: (VMOVDQUstore128 [off1+off2] {sym} ptr val mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) + x := v_0 + if x.Op != OpAMD64ADDQconst { + break + } + off2 := auxIntToInt32(x.AuxInt) + ptr := x.Args[0] + val := v_1 + mem := v_2 + if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1) { + break + } + v.reset(OpAMD64VMOVDQUstore128) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(sym) + v.AddArg3(ptr, val, mem) + return true + } + // match: (VMOVDQUstore128 [off1] {sym1} x:(LEAQ [off2] {sym2} base) val mem) + // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) + // result: (VMOVDQUstore128 [off1+off2] {mergeSym(sym1, sym2)} base val mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym1 := auxToSym(v.Aux) + x := v_0 + if x.Op != OpAMD64LEAQ { + break + } + off2 := auxIntToInt32(x.AuxInt) + sym2 := auxToSym(x.Aux) + base := x.Args[0] + val := v_1 + mem := v_2 + if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2)) { + break + } + v.reset(OpAMD64VMOVDQUstore128) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(mergeSym(sym1, sym2)) + v.AddArg3(base, val, mem) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VMOVDQUstore256(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VMOVDQUstore256 [off1] {sym} x:(ADDQconst [off2] ptr) val mem) + // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 + // result: (VMOVDQUstore256 [off1+off2] {sym} ptr val mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) + x := v_0 + if x.Op != OpAMD64ADDQconst { + break + } + off2 := auxIntToInt32(x.AuxInt) + ptr := x.Args[0] + val := v_1 + mem := v_2 + if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1) { + break + } + v.reset(OpAMD64VMOVDQUstore256) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(sym) + v.AddArg3(ptr, val, mem) + return true + } + // match: (VMOVDQUstore256 [off1] {sym1} x:(LEAQ [off2] {sym2} base) val mem) + // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) + // result: (VMOVDQUstore256 [off1+off2] {mergeSym(sym1, sym2)} base val mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym1 := auxToSym(v.Aux) + x := v_0 + if x.Op != OpAMD64LEAQ { + break + } + off2 := auxIntToInt32(x.AuxInt) + sym2 := auxToSym(x.Aux) + base := x.Args[0] + val := v_1 + mem := v_2 + if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2)) { + break + } + v.reset(OpAMD64VMOVDQUstore256) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(mergeSym(sym1, sym2)) + v.AddArg3(base, val, mem) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VMOVDQUstore512(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VMOVDQUstore512 [off1] {sym} x:(ADDQconst [off2] ptr) val mem) + // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 + // result: (VMOVDQUstore512 [off1+off2] {sym} ptr val mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym := auxToSym(v.Aux) + x := v_0 + if x.Op != OpAMD64ADDQconst { + break + } + off2 := auxIntToInt32(x.AuxInt) + ptr := x.Args[0] + val := v_1 + mem := v_2 + if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1) { + break + 
} + v.reset(OpAMD64VMOVDQUstore512) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(sym) + v.AddArg3(ptr, val, mem) + return true + } + // match: (VMOVDQUstore512 [off1] {sym1} x:(LEAQ [off2] {sym2} base) val mem) + // cond: is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2) + // result: (VMOVDQUstore512 [off1+off2] {mergeSym(sym1, sym2)} base val mem) + for { + off1 := auxIntToInt32(v.AuxInt) + sym1 := auxToSym(v.Aux) + x := v_0 + if x.Op != OpAMD64LEAQ { + break + } + off2 := auxIntToInt32(x.AuxInt) + sym2 := auxToSym(x.Aux) + base := x.Args[0] + val := v_1 + mem := v_2 + if !(is32Bit(int64(off1)+int64(off2)) && x.Uses == 1 && canMergeSym(sym1, sym2)) { + break + } + v.reset(OpAMD64VMOVDQUstore512) + v.AuxInt = int32ToAuxInt(off1 + off2) + v.Aux = symToAux(mergeSym(sym1, sym2)) + v.AddArg3(base, val, mem) + return true + } + return false +} func rewriteValueAMD64_OpAMD64VMOVQ(v *Value) bool { v_0 := v.Args[0] b := v.Block -- 2.52.0
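
For illustration, the effect of the new offset-folding rules on a concrete
SSA value looks like this (a minimal sketch: the offsets, symbol, and value
names are invented for the example, not taken from a real compilation):

	// Before: the constant offset is computed by a separate ADDQconst.
	v1 = ADDQconst [16] ptr
	v2 = VMOVDQUload128 [8] {sym} v1 mem

	// After: is32Bit(8+16) holds and v1 has exactly one use, so the
	// rewrite folds the offset into the load's displacement and v1 dies.
	v2 = VMOVDQUload128 [24] {sym} ptr mem

The LEAQ variants behave the same way, except that they also merge the
LEAQ's symbol into the vector load or store via mergeSym, mirroring the
offset-folding rules that already exist for the scalar MOVQload/MOVQstore
ops.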