The added tests were manually checked to confirm that the peepholes are triggered.
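
For illustration, here is a minimal sketch of the pattern the peepholes
target (the helper names are hypothetical; it uses only simd API that the
tests in this change exercise, and the instruction sequences describe the
intended codegen):

	var bits uint8

	// Store direction: previously KMOVB (mask->GP) followed by a byte
	// store (GP->memory); with the peephole, a single KMOVB store
	// directly from the mask register.
	func storeMask(v simd.Int16x8) {
		bits = v.ToMask().ToBits()
	}

	// Load direction: previously a byte load (memory->GP) followed by
	// KMOVB (GP->mask); with the peephole, a single KMOVB load directly
	// into the mask register.
	func loadMask() simd.Mask64x2 {
		return simd.Mask64x2FromBits(bits)
	}
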
Change-Id: Ibd29eac449869b52c2376f9eafd83410b5266890
Reviewed-on: https://go-review.googlesource.com/c/go/+/710916
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
- case ssa.OpAMD64VMOVDQUload128, ssa.OpAMD64VMOVDQUload256, ssa.OpAMD64VMOVDQUload512, ssa.OpAMD64KMOVQload:
+ case ssa.OpAMD64VMOVDQUload128, ssa.OpAMD64VMOVDQUload256, ssa.OpAMD64VMOVDQUload512,
+ ssa.OpAMD64KMOVBload, ssa.OpAMD64KMOVWload, ssa.OpAMD64KMOVDload, ssa.OpAMD64KMOVQload:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = simdOrMaskReg(v)
- case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512, ssa.OpAMD64KMOVQstore:
+ case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512,
+ ssa.OpAMD64KMOVBstore, ssa.OpAMD64KMOVWstore, ssa.OpAMD64KMOVDstore, ssa.OpAMD64KMOVQstore:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdOrMaskReg(v.Args[1])
(CvtMask64x4to8 <t> x) => (KMOVBi <t> (VPMOVVec64x4ToM <types.TypeMask> x))
(CvtMask64x8to8 <t> x) => (KMOVBi <t> (VPMOVVec64x8ToM <types.TypeMask> x))
+// Peephole optimizations: fold mask<->GP moves into direct mask loads and stores.
+(MOVBstore [off] {sym} ptr (KMOVBi mask) mem) => (KMOVBstore [off] {sym} ptr mask mem)
+(MOVWstore [off] {sym} ptr (KMOVWi mask) mem) => (KMOVWstore [off] {sym} ptr mask mem)
+(MOVLstore [off] {sym} ptr (KMOVDi mask) mem) => (KMOVDstore [off] {sym} ptr mask mem)
+(MOVQstore [off] {sym} ptr (KMOVQi mask) mem) => (KMOVQstore [off] {sym} ptr mask mem)
+
+(KMOVBk l:(MOVBload [off] {sym} ptr mem)) && canMergeLoad(v, l) && clobber(l) => (KMOVBload [off] {sym} ptr mem)
+(KMOVWk l:(MOVWload [off] {sym} ptr mem)) && canMergeLoad(v, l) && clobber(l) => (KMOVWload [off] {sym} ptr mem)
+(KMOVDk l:(MOVLload [off] {sym} ptr mem)) && canMergeLoad(v, l) && clobber(l) => (KMOVDload [off] {sym} ptr mem)
+(KMOVQk l:(MOVQload [off] {sym} ptr mem)) && canMergeLoad(v, l) && clobber(l) => (KMOVQload [off] {sym} ptr mem)
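+// The MOVxstore rules above need no extra condition: they only change which
+// register file the stored value is read from. In the KMOVxk rules,
+// canMergeLoad checks (among other things) that no other memory operation
+// intervenes between the load and this use; clobber then marks the merged
+// load as dead.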
+
// SIMD vector loads and stores
(Load <t> ptr mem) && t.Size() == 16 => (VMOVDQUload128 ptr mem)
(Store {t} ptr val mem) && t.Size() == 16 => (VMOVDQUstore128 ptr val mem)
{name: "VZEROUPPER", argLength: 1, reg: regInfo{clobbers: v}, asm: "VZEROUPPER"}, // arg=mem, returns mem
{name: "VZEROALL", argLength: 1, reg: regInfo{clobbers: v}, asm: "VZEROALL"}, // arg=mem, returns mem
+	// KMOVxload: load a mask from memory.
+	// Loads (Q=8, D=4, W=2, B=1) bytes from (arg0+auxint+aux), arg1=mem.
+ // "+auxint+aux" == add auxint and the offset of the symbol in aux (if any) to the effective address
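+	// For example, (KMOVBload [8] {sym} base mem) loads the byte at
+	// base+8, plus the offset of sym if sym is non-nil.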
+ {name: "KMOVBload", argLength: 2, reg: kload, asm: "KMOVB", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "KMOVWload", argLength: 2, reg: kload, asm: "KMOVW", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+ {name: "KMOVDload", argLength: 2, reg: kload, asm: "KMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
{name: "KMOVQload", argLength: 2, reg: kload, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
+
+	// KMOVxstore: store a mask to memory.
+	// Stores the low (Q=8, D=4, W=2, B=1) bytes of arg1.
+ // Does *(arg0+auxint+aux) = arg1, arg2=mem.
+ {name: "KMOVBstore", argLength: 3, reg: kstore, asm: "KMOVB", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
+ {name: "KMOVWstore", argLength: 3, reg: kstore, asm: "KMOVW", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
+ {name: "KMOVDstore", argLength: 3, reg: kstore, asm: "KMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
{name: "KMOVQstore", argLength: 3, reg: kstore, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
// Move GP directly to mask register
OpAMD64VMOVSDconst
OpAMD64VZEROUPPER
OpAMD64VZEROALL
+ OpAMD64KMOVBload
+ OpAMD64KMOVWload
+ OpAMD64KMOVDload
OpAMD64KMOVQload
+ OpAMD64KMOVBstore
+ OpAMD64KMOVWstore
+ OpAMD64KMOVDstore
OpAMD64KMOVQstore
OpAMD64KMOVQk
OpAMD64KMOVDk
clobbers: 2147418112, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
+ {
+ name: "KMOVBload",
+ auxType: auxSymOff,
+ argLen: 2,
+ faultOnNilArg0: true,
+ symEffect: SymRead,
+ asm: x86.AKMOVB,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+ },
+ outputs: []outputInfo{
+ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ },
+ },
+ },
+ {
+ name: "KMOVWload",
+ auxType: auxSymOff,
+ argLen: 2,
+ faultOnNilArg0: true,
+ symEffect: SymRead,
+ asm: x86.AKMOVW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+ },
+ outputs: []outputInfo{
+ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ },
+ },
+ },
+ {
+ name: "KMOVDload",
+ auxType: auxSymOff,
+ argLen: 2,
+ faultOnNilArg0: true,
+ symEffect: SymRead,
+ asm: x86.AKMOVD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+ },
+ outputs: []outputInfo{
+ {0, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ },
+ },
+ },
{
name: "KMOVQload",
auxType: auxSymOff,
},
},
},
+ {
+ name: "KMOVBstore",
+ auxType: auxSymOff,
+ argLen: 3,
+ faultOnNilArg0: true,
+ symEffect: SymWrite,
+ asm: x86.AKMOVB,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+ },
+ },
+ },
+ {
+ name: "KMOVWstore",
+ auxType: auxSymOff,
+ argLen: 3,
+ faultOnNilArg0: true,
+ symEffect: SymWrite,
+ asm: x86.AKMOVW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+ },
+ },
+ },
+ {
+ name: "KMOVDstore",
+ auxType: auxSymOff,
+ argLen: 3,
+ faultOnNilArg0: true,
+ symEffect: SymWrite,
+ asm: x86.AKMOVD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
+ },
+ },
+ },
{
name: "KMOVQstore",
auxType: auxSymOff,
return rewriteValueAMD64_OpAMD64HMULQ(v)
case OpAMD64HMULQU:
return rewriteValueAMD64_OpAMD64HMULQU(v)
+ case OpAMD64KMOVBk:
+ return rewriteValueAMD64_OpAMD64KMOVBk(v)
+ case OpAMD64KMOVDk:
+ return rewriteValueAMD64_OpAMD64KMOVDk(v)
+ case OpAMD64KMOVQk:
+ return rewriteValueAMD64_OpAMD64KMOVQk(v)
+ case OpAMD64KMOVWk:
+ return rewriteValueAMD64_OpAMD64KMOVWk(v)
case OpAMD64LEAL:
return rewriteValueAMD64_OpAMD64LEAL(v)
case OpAMD64LEAL1:
}
return false
}
+func rewriteValueAMD64_OpAMD64KMOVBk(v *Value) bool {
+ v_0 := v.Args[0]
+ // match: (KMOVBk l:(MOVBload [off] {sym} ptr mem))
+ // cond: canMergeLoad(v, l) && clobber(l)
+ // result: (KMOVBload [off] {sym} ptr mem)
+ for {
+ l := v_0
+ if l.Op != OpAMD64MOVBload {
+ break
+ }
+ off := auxIntToInt32(l.AuxInt)
+ sym := auxToSym(l.Aux)
+ mem := l.Args[1]
+ ptr := l.Args[0]
+ if !(canMergeLoad(v, l) && clobber(l)) {
+ break
+ }
+ v.reset(OpAMD64KMOVBload)
+ v.AuxInt = int32ToAuxInt(off)
+ v.Aux = symToAux(sym)
+ v.AddArg2(ptr, mem)
+ return true
+ }
+ return false
+}
+func rewriteValueAMD64_OpAMD64KMOVDk(v *Value) bool {
+ v_0 := v.Args[0]
+ // match: (KMOVDk l:(MOVLload [off] {sym} ptr mem))
+ // cond: canMergeLoad(v, l) && clobber(l)
+ // result: (KMOVDload [off] {sym} ptr mem)
+ for {
+ l := v_0
+ if l.Op != OpAMD64MOVLload {
+ break
+ }
+ off := auxIntToInt32(l.AuxInt)
+ sym := auxToSym(l.Aux)
+ mem := l.Args[1]
+ ptr := l.Args[0]
+ if !(canMergeLoad(v, l) && clobber(l)) {
+ break
+ }
+ v.reset(OpAMD64KMOVDload)
+ v.AuxInt = int32ToAuxInt(off)
+ v.Aux = symToAux(sym)
+ v.AddArg2(ptr, mem)
+ return true
+ }
+ return false
+}
+func rewriteValueAMD64_OpAMD64KMOVQk(v *Value) bool {
+ v_0 := v.Args[0]
+ // match: (KMOVQk l:(MOVQload [off] {sym} ptr mem))
+ // cond: canMergeLoad(v, l) && clobber(l)
+ // result: (KMOVQload [off] {sym} ptr mem)
+ for {
+ l := v_0
+ if l.Op != OpAMD64MOVQload {
+ break
+ }
+ off := auxIntToInt32(l.AuxInt)
+ sym := auxToSym(l.Aux)
+ mem := l.Args[1]
+ ptr := l.Args[0]
+ if !(canMergeLoad(v, l) && clobber(l)) {
+ break
+ }
+ v.reset(OpAMD64KMOVQload)
+ v.AuxInt = int32ToAuxInt(off)
+ v.Aux = symToAux(sym)
+ v.AddArg2(ptr, mem)
+ return true
+ }
+ return false
+}
+func rewriteValueAMD64_OpAMD64KMOVWk(v *Value) bool {
+ v_0 := v.Args[0]
+ // match: (KMOVWk l:(MOVWload [off] {sym} ptr mem))
+ // cond: canMergeLoad(v, l) && clobber(l)
+ // result: (KMOVWload [off] {sym} ptr mem)
+ for {
+ l := v_0
+ if l.Op != OpAMD64MOVWload {
+ break
+ }
+ off := auxIntToInt32(l.AuxInt)
+ sym := auxToSym(l.Aux)
+ mem := l.Args[1]
+ ptr := l.Args[0]
+ if !(canMergeLoad(v, l) && clobber(l)) {
+ break
+ }
+ v.reset(OpAMD64KMOVWload)
+ v.AuxInt = int32ToAuxInt(off)
+ v.Aux = symToAux(sym)
+ v.AddArg2(ptr, mem)
+ return true
+ }
+ return false
+}
func rewriteValueAMD64_OpAMD64LEAL(v *Value) bool {
v_0 := v.Args[0]
// match: (LEAL [c] {s} (ADDLconst [d] x))
v.AddArg3(base, val, mem)
return true
}
+ // match: (MOVBstore [off] {sym} ptr (KMOVBi mask) mem)
+ // result: (KMOVBstore [off] {sym} ptr mask mem)
+ for {
+ off := auxIntToInt32(v.AuxInt)
+ sym := auxToSym(v.Aux)
+ ptr := v_0
+ if v_1.Op != OpAMD64KMOVBi {
+ break
+ }
+ mask := v_1.Args[0]
+ mem := v_2
+ v.reset(OpAMD64KMOVBstore)
+ v.AuxInt = int32ToAuxInt(off)
+ v.Aux = symToAux(sym)
+ v.AddArg3(ptr, mask, mem)
+ return true
+ }
return false
}
func rewriteValueAMD64_OpAMD64MOVBstoreconst(v *Value) bool {
v.AddArg3(p, w, mem)
return true
}
+ // match: (MOVLstore [off] {sym} ptr (KMOVDi mask) mem)
+ // result: (KMOVDstore [off] {sym} ptr mask mem)
+ for {
+ off := auxIntToInt32(v.AuxInt)
+ sym := auxToSym(v.Aux)
+ ptr := v_0
+ if v_1.Op != OpAMD64KMOVDi {
+ break
+ }
+ mask := v_1.Args[0]
+ mem := v_2
+ v.reset(OpAMD64KMOVDstore)
+ v.AuxInt = int32ToAuxInt(off)
+ v.Aux = symToAux(sym)
+ v.AddArg3(ptr, mask, mem)
+ return true
+ }
return false
}
func rewriteValueAMD64_OpAMD64MOVLstoreconst(v *Value) bool {
v.AddArg3(p, w, mem)
return true
}
+ // match: (MOVQstore [off] {sym} ptr (KMOVQi mask) mem)
+ // result: (KMOVQstore [off] {sym} ptr mask mem)
+ for {
+ off := auxIntToInt32(v.AuxInt)
+ sym := auxToSym(v.Aux)
+ ptr := v_0
+ if v_1.Op != OpAMD64KMOVQi {
+ break
+ }
+ mask := v_1.Args[0]
+ mem := v_2
+ v.reset(OpAMD64KMOVQstore)
+ v.AuxInt = int32ToAuxInt(off)
+ v.Aux = symToAux(sym)
+ v.AddArg3(ptr, mask, mem)
+ return true
+ }
return false
}
func rewriteValueAMD64_OpAMD64MOVQstoreconst(v *Value) bool {
v.AddArg3(p, w, mem)
return true
}
+ // match: (MOVWstore [off] {sym} ptr (KMOVWi mask) mem)
+ // result: (KMOVWstore [off] {sym} ptr mask mem)
+ for {
+ off := auxIntToInt32(v.AuxInt)
+ sym := auxToSym(v.Aux)
+ ptr := v_0
+ if v_1.Op != OpAMD64KMOVWi {
+ break
+ }
+ mask := v_1.Args[0]
+ mem := v_2
+ v.reset(OpAMD64KMOVWstore)
+ v.AuxInt = int32ToAuxInt(off)
+ v.Aux = symToAux(sym)
+ v.AddArg3(ptr, mask, mem)
+ return true
+ }
return false
}
func rewriteValueAMD64_OpAMD64MOVWstoreconst(v *Value) bool {
}
}
+var maskForTestBitMaskFromBitsLoad = uint8(0b10)
+
+func TestBitMaskFromBitsLoad(t *testing.T) {
+ if !simd.HasAVX512() {
+ t.Skip("Test requires HasAVX512, not available on this hardware")
+ }
+ results := [2]int64{}
+ want := [2]int64{0, 6}
+ m := simd.Mask64x2FromBits(maskForTestBitMaskFromBitsLoad)
+ simd.LoadInt64x2Slice([]int64{1, 2}).Add(simd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results)
+ for i := range 2 {
+ if results[i] != want[i] {
+ t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
+ }
+ }
+}
+
func TestBitMaskToBits(t *testing.T) {
if !simd.HasAVX512() {
t.Skip("Test requires HasAVX512, not available on this hardware")
}
}
+var maskForTestBitMaskToBitsStore uint8
+
+func TestBitMaskToBitsStore(t *testing.T) {
+ if !simd.HasAVX512() {
+ t.Skip("Test requires HasAVX512, not available on this hardware")
+ }
+	maskForTestBitMaskToBitsStore = simd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits()
+	if maskForTestBitMaskToBitsStore != 0b101 {
+		t.Errorf("Want 0b101, got %b", maskForTestBitMaskToBitsStore)
+ }
+}
+
func TestMergeFloat(t *testing.T) {
k := make([]int64, 4, 4)
s := make([]float64, 4, 4)