// SIMD ops
case ssa.OpAMD64VZEROUPPER, ssa.OpAMD64VZEROALL:
s.Prog(v.Op.Asm())
- case ssa.OpAMD64Zero128, ssa.OpAMD64Zero256, ssa.OpAMD64Zero512:
- // zero-width, no instruction generated
+
+ case ssa.OpAMD64Zero128, ssa.OpAMD64Zero256, ssa.OpAMD64Zero512: // no code emitted
+
+ case ssa.OpAMD64VMOVSSf2v, ssa.OpAMD64VMOVSDf2v:
+ // These initialize the least-significant 32/64 bits of a SIMD register from a "float".
+ p := s.Prog(v.Op.Asm())
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = v.Args[0].Reg()
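+ // X15 is the fixed zero register (internal ABI); merging with it zeroes the rest of the destination.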
+ p.AddRestSourceReg(x86.REG_X15)
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = simdReg(v)
+
+ case ssa.OpAMD64VMOVD, ssa.OpAMD64VMOVQ:
+ // These initialize the least-significant 32/64 bits of a SIMD register from an "int".
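+ // VMOVD/VMOVQ zero-extend into the destination, so no merge with the zero register is needed.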
+ p := s.Prog(v.Op.Asm())
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = v.Args[0].Reg()
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = simdReg(v)
+
case ssa.OpAMD64VMOVDQUload128, ssa.OpAMD64VMOVDQUload256, ssa.OpAMD64VMOVDQUload512, ssa.OpAMD64KMOVQload:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
(VPANDQ512 x (VPMOVMToVec32x16 k)) => (VMOVDQU32Masked512 x k)
(VPANDQ512 x (VPMOVMToVec16x32 k)) => (VMOVDQU16Masked512 x k)
(VPANDQ512 x (VPMOVMToVec8x64 k)) => (VMOVDQU8Masked512 x k)
+
+// Inserting a 32/64-bit float or int into element 0 of an all-zero vector is just VMOVS[SD] (floats) or VMOV[DQ] (ints).
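+// Floats arrive in an FP register, so VMOVS[SD] is used with the zero register X15 as the merge source;
+// ints arrive in a GP register, so VMOVD/VMOVQ, which zero-extend, are used.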
+(VPINSRQ128 [0] (Zero128 <t>) y) && y.Type.IsFloat() => (VMOVSDf2v <types.TypeVec128> y)
+(VPINSRD128 [0] (Zero128 <t>) y) && y.Type.IsFloat() => (VMOVSSf2v <types.TypeVec128> y)
+(VPINSRQ128 [0] (Zero128 <t>) y) && !y.Type.IsFloat() => (VMOVQ <types.TypeVec128> y)
+(VPINSRD128 [0] (Zero128 <t>) y) && !y.Type.IsFloat() => (VMOVD <types.TypeVec128> y)
+
+// These rewrites can skip zero-extending the 8/16-bit inputs because the result is
+// only used as the input to a broadcast, which ignores the potentially "bad" upper bits.
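+// E.g. VPBROADCASTB reads only byte 0 of its source, so the bits that a VPINSRB128
+// into Zero128 would have zeroed are never observed; a plain VMOVQ of y suffices.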
+(VPBROADCASTB(128|256|512) x:(VPINSRB128 [0] (Zero128 <t>) y)) && x.Uses == 1 =>
+ (VPBROADCASTB(128|256|512) (VMOVQ <types.TypeVec128> y))
+(VPBROADCASTW(128|256|512) x:(VPINSRW128 [0] (Zero128 <t>) y)) && x.Uses == 1 =>
+ (VPBROADCASTW(128|256|512) (VMOVQ <types.TypeVec128> y))
+
vgp = regInfo{inputs: vonly, outputs: gponly}
vfpv = regInfo{inputs: []regMask{vz, fp}, outputs: vonly}
vfpkv = regInfo{inputs: []regMask{vz, fp, mask}, outputs: vonly}
+ fpv = regInfo{inputs: []regMask{fp}, outputs: vonly}
+ gpv = regInfo{inputs: []regMask{gp}, outputs: vonly}
w11 = regInfo{inputs: wzonly, outputs: wonly}
w21 = regInfo{inputs: []regMask{wz, wz}, outputs: wonly}
{name: "Zero256", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true},
{name: "Zero512", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true},
+ {name: "VMOVSDf2v", argLength: 1, reg: fpv, asm: "VMOVSD"},
+ {name: "VMOVSSf2v", argLength: 1, reg: fpv, asm: "VMOVSS"},
+ {name: "VMOVQ", argLength: 1, reg: gpv, asm: "VMOVQ"},
+ {name: "VMOVD", argLength: 1, reg: gpv, asm: "VMOVD"},
+
{name: "VZEROUPPER", argLength: 0, asm: "VZEROUPPER"},
{name: "VZEROALL", argLength: 0, asm: "VZEROALL"},
if !reservedNames[name] {
panic(fmt.Sprintf("declReserved call does not use a reserved name: %q", name))
}
- return &Declare{name, exprf(value)}
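+ // Passing value as an argument to "%s" keeps exprf from treating any '%' in value as a formatting directive.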
+ return &Declare{name, exprf("%s", value)}
}
// breakf constructs a simple "if cond { break }" statement, using exprf for its
if vname == "" {
vname = fmt.Sprintf("v_%v", i)
}
- rr.add(declf(rr.Loc, vname, cname))
+ rr.add(declf(rr.Loc, vname, "%s", cname))
p, op := genMatch0(rr, arch, expr, vname, nil, false) // TODO: pass non-nil cnt?
if op != "" {
check := fmt.Sprintf("%s.Op == %s", cname, op)
}
pos[i] = p
} else {
- rr.add(declf(rr.Loc, arg, cname))
+ rr.add(declf(rr.Loc, arg, "%s", cname))
pos[i] = arg + ".Pos"
}
}
OpAMD64Zero128
OpAMD64Zero256
OpAMD64Zero512
+ OpAMD64VMOVSDf2v
+ OpAMD64VMOVSSf2v
+ OpAMD64VMOVQ
+ OpAMD64VMOVD
OpAMD64VZEROUPPER
OpAMD64VZEROALL
OpAMD64KMOVQload
},
},
},
+ {
+ name: "VMOVSDf2v",
+ argLen: 1,
+ asm: x86.AVMOVSD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ outputs: []outputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ },
+ },
+ {
+ name: "VMOVSSf2v",
+ argLen: 1,
+ asm: x86.AVMOVSS,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ outputs: []outputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ },
+ },
+ {
+ name: "VMOVQ",
+ argLen: 1,
+ asm: x86.AVMOVQ,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ outputs: []outputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ },
+ },
+ {
+ name: "VMOVD",
+ argLen: 1,
+ asm: x86.AVMOVD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ outputs: []outputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ },
+ },
{
name: "VZEROUPPER",
argLen: 0,
return rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v)
case OpAMD64VPANDQ512:
return rewriteValueAMD64_OpAMD64VPANDQ512(v)
+ case OpAMD64VPBROADCASTB128:
+ return rewriteValueAMD64_OpAMD64VPBROADCASTB128(v)
+ case OpAMD64VPBROADCASTB256:
+ return rewriteValueAMD64_OpAMD64VPBROADCASTB256(v)
+ case OpAMD64VPBROADCASTB512:
+ return rewriteValueAMD64_OpAMD64VPBROADCASTB512(v)
+ case OpAMD64VPBROADCASTW128:
+ return rewriteValueAMD64_OpAMD64VPBROADCASTW128(v)
+ case OpAMD64VPBROADCASTW256:
+ return rewriteValueAMD64_OpAMD64VPBROADCASTW256(v)
+ case OpAMD64VPBROADCASTW512:
+ return rewriteValueAMD64_OpAMD64VPBROADCASTW512(v)
+ case OpAMD64VPINSRD128:
+ return rewriteValueAMD64_OpAMD64VPINSRD128(v)
+ case OpAMD64VPINSRQ128:
+ return rewriteValueAMD64_OpAMD64VPINSRQ128(v)
case OpAMD64VPMOVVec16x16ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec16x16ToM(v)
case OpAMD64VPMOVVec16x32ToM:
}
return false
}
+func rewriteValueAMD64_OpAMD64VPBROADCASTB128(v *Value) bool {
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (VPBROADCASTB128 x:(VPINSRB128 [0] (Zero128 <t>) y))
+ // cond: x.Uses == 1
+ // result: (VPBROADCASTB128 (VMOVQ <types.TypeVec128> y))
+ for {
+ x := v_0
+ if x.Op != OpAMD64VPINSRB128 || auxIntToUint8(x.AuxInt) != 0 {
+ break
+ }
+ y := x.Args[1]
+ x_0 := x.Args[0]
+ if x_0.Op != OpAMD64Zero128 {
+ break
+ }
+ if !(x.Uses == 1) {
+ break
+ }
+ v.reset(OpAMD64VPBROADCASTB128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VMOVQ, types.TypeVec128)
+ v0.AddArg(y)
+ v.AddArg(v0)
+ return true
+ }
+ return false
+}
+func rewriteValueAMD64_OpAMD64VPBROADCASTB256(v *Value) bool {
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (VPBROADCASTB256 x:(VPINSRB128 [0] (Zero128 <t>) y))
+ // cond: x.Uses == 1
+ // result: (VPBROADCASTB256 (VMOVQ <types.TypeVec128> y))
+ for {
+ x := v_0
+ if x.Op != OpAMD64VPINSRB128 || auxIntToUint8(x.AuxInt) != 0 {
+ break
+ }
+ y := x.Args[1]
+ x_0 := x.Args[0]
+ if x_0.Op != OpAMD64Zero128 {
+ break
+ }
+ if !(x.Uses == 1) {
+ break
+ }
+ v.reset(OpAMD64VPBROADCASTB256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VMOVQ, types.TypeVec128)
+ v0.AddArg(y)
+ v.AddArg(v0)
+ return true
+ }
+ return false
+}
+func rewriteValueAMD64_OpAMD64VPBROADCASTB512(v *Value) bool {
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (VPBROADCASTB512 x:(VPINSRB128 [0] (Zero128 <t>) y))
+ // cond: x.Uses == 1
+ // result: (VPBROADCASTB512 (VMOVQ <types.TypeVec128> y))
+ for {
+ x := v_0
+ if x.Op != OpAMD64VPINSRB128 || auxIntToUint8(x.AuxInt) != 0 {
+ break
+ }
+ y := x.Args[1]
+ x_0 := x.Args[0]
+ if x_0.Op != OpAMD64Zero128 {
+ break
+ }
+ if !(x.Uses == 1) {
+ break
+ }
+ v.reset(OpAMD64VPBROADCASTB512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VMOVQ, types.TypeVec128)
+ v0.AddArg(y)
+ v.AddArg(v0)
+ return true
+ }
+ return false
+}
+func rewriteValueAMD64_OpAMD64VPBROADCASTW128(v *Value) bool {
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (VPBROADCASTW128 x:(VPINSRW128 [0] (Zero128 <t>) y))
+ // cond: x.Uses == 1
+ // result: (VPBROADCASTW128 (VMOVQ <types.TypeVec128> y))
+ for {
+ x := v_0
+ if x.Op != OpAMD64VPINSRW128 || auxIntToUint8(x.AuxInt) != 0 {
+ break
+ }
+ y := x.Args[1]
+ x_0 := x.Args[0]
+ if x_0.Op != OpAMD64Zero128 {
+ break
+ }
+ if !(x.Uses == 1) {
+ break
+ }
+ v.reset(OpAMD64VPBROADCASTW128)
+ v0 := b.NewValue0(v.Pos, OpAMD64VMOVQ, types.TypeVec128)
+ v0.AddArg(y)
+ v.AddArg(v0)
+ return true
+ }
+ return false
+}
+func rewriteValueAMD64_OpAMD64VPBROADCASTW256(v *Value) bool {
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (VPBROADCASTW256 x:(VPINSRW128 [0] (Zero128 <t>) y))
+ // cond: x.Uses == 1
+ // result: (VPBROADCASTW256 (VMOVQ <types.TypeVec128> y))
+ for {
+ x := v_0
+ if x.Op != OpAMD64VPINSRW128 || auxIntToUint8(x.AuxInt) != 0 {
+ break
+ }
+ y := x.Args[1]
+ x_0 := x.Args[0]
+ if x_0.Op != OpAMD64Zero128 {
+ break
+ }
+ if !(x.Uses == 1) {
+ break
+ }
+ v.reset(OpAMD64VPBROADCASTW256)
+ v0 := b.NewValue0(v.Pos, OpAMD64VMOVQ, types.TypeVec128)
+ v0.AddArg(y)
+ v.AddArg(v0)
+ return true
+ }
+ return false
+}
+func rewriteValueAMD64_OpAMD64VPBROADCASTW512(v *Value) bool {
+ v_0 := v.Args[0]
+ b := v.Block
+ // match: (VPBROADCASTW512 x:(VPINSRW128 [0] (Zero128 <t>) y))
+ // cond: x.Uses == 1
+ // result: (VPBROADCASTW512 (VMOVQ <types.TypeVec128> y))
+ for {
+ x := v_0
+ if x.Op != OpAMD64VPINSRW128 || auxIntToUint8(x.AuxInt) != 0 {
+ break
+ }
+ y := x.Args[1]
+ x_0 := x.Args[0]
+ if x_0.Op != OpAMD64Zero128 {
+ break
+ }
+ if !(x.Uses == 1) {
+ break
+ }
+ v.reset(OpAMD64VPBROADCASTW512)
+ v0 := b.NewValue0(v.Pos, OpAMD64VMOVQ, types.TypeVec128)
+ v0.AddArg(y)
+ v.AddArg(v0)
+ return true
+ }
+ return false
+}
+func rewriteValueAMD64_OpAMD64VPINSRD128(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ // match: (VPINSRD128 [0] (Zero128 <t>) y)
+ // cond: y.Type.IsFloat()
+ // result: (VMOVSSf2v <types.TypeVec128> y)
+ for {
+ if auxIntToUint8(v.AuxInt) != 0 || v_0.Op != OpAMD64Zero128 {
+ break
+ }
+ y := v_1
+ if !(y.Type.IsFloat()) {
+ break
+ }
+ v.reset(OpAMD64VMOVSSf2v)
+ v.Type = types.TypeVec128
+ v.AddArg(y)
+ return true
+ }
+ // match: (VPINSRD128 [0] (Zero128 <t>) y)
+ // cond: !y.Type.IsFloat()
+ // result: (VMOVD <types.TypeVec128> y)
+ for {
+ if auxIntToUint8(v.AuxInt) != 0 || v_0.Op != OpAMD64Zero128 {
+ break
+ }
+ y := v_1
+ if !(!y.Type.IsFloat()) {
+ break
+ }
+ v.reset(OpAMD64VMOVD)
+ v.Type = types.TypeVec128
+ v.AddArg(y)
+ return true
+ }
+ return false
+}
+func rewriteValueAMD64_OpAMD64VPINSRQ128(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ // match: (VPINSRQ128 [0] (Zero128 <t>) y)
+ // cond: y.Type.IsFloat()
+ // result: (VMOVSDf2v <types.TypeVec128> y)
+ for {
+ if auxIntToUint8(v.AuxInt) != 0 || v_0.Op != OpAMD64Zero128 {
+ break
+ }
+ y := v_1
+ if !(y.Type.IsFloat()) {
+ break
+ }
+ v.reset(OpAMD64VMOVSDf2v)
+ v.Type = types.TypeVec128
+ v.AddArg(y)
+ return true
+ }
+ // match: (VPINSRQ128 [0] (Zero128 <t>) y)
+ // cond: !y.Type.IsFloat()
+ // result: (VMOVQ <types.TypeVec128> y)
+ for {
+ if auxIntToUint8(v.AuxInt) != 0 || v_0.Op != OpAMD64Zero128 {
+ break
+ }
+ y := v_1
+ if !(!y.Type.IsFloat()) {
+ break
+ }
+ v.reset(OpAMD64VMOVQ)
+ v.Type = types.TypeVec128
+ v.AddArg(y)
+ return true
+ }
+ return false
+}
func rewriteValueAMD64_OpAMD64VPMOVVec16x16ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec16x16ToM (VPMOVMToVec16x16 x))
checkSlices(t, s, []uint64{123456789, 123456789})
}
+func TestBroadcastUint16x8(t *testing.T) {
+ s := make([]uint16, 8, 8)
+ simd.BroadcastUint16x8(12345).StoreSlice(s)
+ checkSlices(t, s, []uint16{12345, 12345, 12345, 12345, 12345, 12345, 12345, 12345})
+}
+
+func TestBroadcastInt8x32(t *testing.T) {
+ s := make([]int8, 32, 32)
+ simd.BroadcastInt8x32(-123).StoreSlice(s)
+ checkSlices(t, s, []int8{-123, -123, -123, -123, -123, -123, -123, -123,
+ -123, -123, -123, -123, -123, -123, -123, -123,
+ -123, -123, -123, -123, -123, -123, -123, -123,
+ -123, -123, -123, -123, -123, -123, -123, -123,
+ })
+}
+
func TestMaskOpt512(t *testing.T) {
if !simd.HasAVX512() {
t.Skip("Test requires HasAVX512, not available on this hardware")