return rewriteValueAMD64_OpAMD64VPOPCNTQMasked256(v)
case OpAMD64VPOPCNTQMasked512:
return rewriteValueAMD64_OpAMD64VPOPCNTQMasked512(v)
+ case OpAMD64VPOR128:
+ return rewriteValueAMD64_OpAMD64VPOR128(v)
+ case OpAMD64VPOR256:
+ return rewriteValueAMD64_OpAMD64VPOR256(v)
case OpAMD64VPORD512:
return rewriteValueAMD64_OpAMD64VPORD512(v)
case OpAMD64VPORDMasked128:
}
return false
}
+func rewriteValueAMD64_OpAMD64VPOR128(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ // match: (VPOR128 (VCMPPS128 [3] x x) (VCMPPS128 [3] y y))
+ // result: (VCMPPS128 [3] x y)
+ for {
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ if v_0.Op != OpAMD64VCMPPS128 || auxIntToUint8(v_0.AuxInt) != 3 {
+ continue
+ }
+ x := v_0.Args[1]
+ if x != v_0.Args[0] || v_1.Op != OpAMD64VCMPPS128 || auxIntToUint8(v_1.AuxInt) != 3 {
+ continue
+ }
+ y := v_1.Args[1]
+ if y != v_1.Args[0] {
+ continue
+ }
+ v.reset(OpAMD64VCMPPS128)
+ v.AuxInt = uint8ToAuxInt(3)
+ v.AddArg2(x, y)
+ return true
+ }
+ break
+ }
+ // match: (VPOR128 (VCMPPD128 [3] x x) (VCMPPD128 [3] y y))
+ // result: (VCMPPD128 [3] x y)
+ for {
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ if v_0.Op != OpAMD64VCMPPD128 || auxIntToUint8(v_0.AuxInt) != 3 {
+ continue
+ }
+ x := v_0.Args[1]
+ if x != v_0.Args[0] || v_1.Op != OpAMD64VCMPPD128 || auxIntToUint8(v_1.AuxInt) != 3 {
+ continue
+ }
+ y := v_1.Args[1]
+ if y != v_1.Args[0] {
+ continue
+ }
+ v.reset(OpAMD64VCMPPD128)
+ v.AuxInt = uint8ToAuxInt(3)
+ v.AddArg2(x, y)
+ return true
+ }
+ break
+ }
+ return false
+}
+func rewriteValueAMD64_OpAMD64VPOR256(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ // match: (VPOR256 (VCMPPS256 [3] x x) (VCMPPS256 [3] y y))
+ // result: (VCMPPS256 [3] x y)
+ for {
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ if v_0.Op != OpAMD64VCMPPS256 || auxIntToUint8(v_0.AuxInt) != 3 {
+ continue
+ }
+ x := v_0.Args[1]
+ if x != v_0.Args[0] || v_1.Op != OpAMD64VCMPPS256 || auxIntToUint8(v_1.AuxInt) != 3 {
+ continue
+ }
+ y := v_1.Args[1]
+ if y != v_1.Args[0] {
+ continue
+ }
+ v.reset(OpAMD64VCMPPS256)
+ v.AuxInt = uint8ToAuxInt(3)
+ v.AddArg2(x, y)
+ return true
+ }
+ break
+ }
+ // match: (VPOR256 (VCMPPD256 [3] x x) (VCMPPD256 [3] y y))
+ // result: (VCMPPD256 [3] x y)
+ for {
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ if v_0.Op != OpAMD64VCMPPD256 || auxIntToUint8(v_0.AuxInt) != 3 {
+ continue
+ }
+ x := v_0.Args[1]
+ if x != v_0.Args[0] || v_1.Op != OpAMD64VCMPPD256 || auxIntToUint8(v_1.AuxInt) != 3 {
+ continue
+ }
+ y := v_1.Args[1]
+ if y != v_1.Args[0] {
+ continue
+ }
+ v.reset(OpAMD64VCMPPD256)
+ v.AuxInt = uint8ToAuxInt(3)
+ v.AddArg2(x, y)
+ return true
+ }
+ break
+ }
+ return false
+}
func rewriteValueAMD64_OpAMD64VPORD512(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
+ b := v.Block
+ typ := &b.Func.Config.Types
+ // match: (VPORD512 (VPMOVMToVec32x16 (VCMPPS512 [3] x x)) (VPMOVMToVec32x16 (VCMPPS512 [3] y y)))
+ // result: (VPMOVMToVec32x16 (VCMPPS512 [3] x y))
+ for {
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ if v_0.Op != OpAMD64VPMOVMToVec32x16 {
+ continue
+ }
+ v_0_0 := v_0.Args[0]
+ if v_0_0.Op != OpAMD64VCMPPS512 || auxIntToUint8(v_0_0.AuxInt) != 3 {
+ continue
+ }
+ x := v_0_0.Args[1]
+ if x != v_0_0.Args[0] || v_1.Op != OpAMD64VPMOVMToVec32x16 {
+ continue
+ }
+ v_1_0 := v_1.Args[0]
+ if v_1_0.Op != OpAMD64VCMPPS512 || auxIntToUint8(v_1_0.AuxInt) != 3 {
+ continue
+ }
+ y := v_1_0.Args[1]
+ if y != v_1_0.Args[0] {
+ continue
+ }
+ v.reset(OpAMD64VPMOVMToVec32x16)
+ v0 := b.NewValue0(v.Pos, OpAMD64VCMPPS512, typ.Mask)
+ v0.AuxInt = uint8ToAuxInt(3)
+ v0.AddArg2(x, y)
+ v.AddArg(v0)
+ return true
+ }
+ break
+ }
+ // match: (VPORD512 (VPMOVMToVec64x8 (VCMPPD512 [3] x x)) (VPMOVMToVec64x8 (VCMPPD512 [3] y y)))
+ // result: (VPMOVMToVec64x8 (VCMPPD512 [3] x y))
+ for {
+ for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+ if v_0.Op != OpAMD64VPMOVMToVec64x8 {
+ continue
+ }
+ v_0_0 := v_0.Args[0]
+ if v_0_0.Op != OpAMD64VCMPPD512 || auxIntToUint8(v_0_0.AuxInt) != 3 {
+ continue
+ }
+ x := v_0_0.Args[1]
+ if x != v_0_0.Args[0] || v_1.Op != OpAMD64VPMOVMToVec64x8 {
+ continue
+ }
+ v_1_0 := v_1.Args[0]
+ if v_1_0.Op != OpAMD64VCMPPD512 || auxIntToUint8(v_1_0.AuxInt) != 3 {
+ continue
+ }
+ y := v_1_0.Args[1]
+ if y != v_1_0.Args[0] {
+ continue
+ }
+ v.reset(OpAMD64VPMOVMToVec64x8)
+ v0 := b.NewValue0(v.Pos, OpAMD64VCMPPD512, typ.Mask)
+ v0.AuxInt = uint8ToAuxInt(3)
+ v0.AddArg2(x, y)
+ v.AddArg(v0)
+ return true
+ }
+ break
+ }
// match: (VPORD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
// cond: canMergeLoad(v, l) && clobber(l)
// result: (VPORD512load {sym} [off] x ptr mem)
testFloat32x16UnaryCompare(t, archsimd.Float32x16.IsNaN, isNaNSlice[float32])
testFloat64x8UnaryCompare(t, archsimd.Float64x8.IsNaN, isNaNSlice[float64])
}
+
+ // Test x.IsNaN().Or(y.IsNaN()), which is optimized to VCMPP(S|D) $3, x, y.
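+ // (Immediate 3 selects the "unordered" compare predicate, which is true in a
+ // lane exactly when either operand is NaN, so a single VCMPP(S|D) of x against
+ // y must agree with the OR of the two self-compares.)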
+ want32 := mapCompare(func(x, y float32) bool { return x != x || y != y })
+ want64 := mapCompare(func(x, y float64) bool { return x != x || y != y })
+ testFloat32x4Compare(t,
+ func(x, y archsimd.Float32x4) archsimd.Mask32x4 {
+ return x.IsNaN().Or(y.IsNaN())
+ }, want32)
+ testFloat64x2Compare(t,
+ func(x, y archsimd.Float64x2) archsimd.Mask64x2 {
+ return x.IsNaN().Or(y.IsNaN())
+ }, want64)
+
+ if archsimd.X86.AVX2() {
+ testFloat32x8Compare(t,
+ func(x, y archsimd.Float32x8) archsimd.Mask32x8 {
+ return x.IsNaN().Or(y.IsNaN())
+ }, want32)
+ testFloat64x4Compare(t,
+ func(x, y archsimd.Float64x4) archsimd.Mask64x4 {
+ return x.IsNaN().Or(y.IsNaN())
+ }, want64)
+ }
+
+ if archsimd.X86.AVX512() {
+ testFloat32x16Compare(t,
+ func(x, y archsimd.Float32x16) archsimd.Mask32x16 {
+ return x.IsNaN().Or(y.IsNaN())
+ }, want32)
+ testFloat64x8Compare(t,
+ func(x, y archsimd.Float64x8) archsimd.Mask64x8 {
+ return x.IsNaN().Or(y.IsNaN())
+ }, want64)
+ }
}
// These tests check code generation of simd peephole optimizations.
-//go:build goexperiment.simd
+//go:build goexperiment.simd && amd64
package codegen
-import "simd/archsimd"
+import (
+ "math"
+ "simd/archsimd"
+)
func vptest1() bool {
v1 := archsimd.LoadUint64x2Slice([]uint64{0, 1})
mask := archsimd.Mask16x16FromBits(5)
return x.Add(y).Merge(x, mask) // amd64:`VPBLENDVB\s.*$`
}
+
+var nan = math.NaN()
+var floats64s = []float64{0, 1, 2, nan, 4, nan, 6, 7, 8, 9, 10, 11, nan, 13, 14, 15}
+var sinkInt64s = make([]int64, 100)
+
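+// simdIsNaN exercises x.IsNaN().Or(y.IsNaN()) on Float64x4: the VPOR peephole
+// should fold the two unordered self-compares and the vector OR into a single
+// VCMPPD with predicate 3, so the check below requires VCMPPD $3 and no VPOR.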
+func simdIsNaN() {
+ x := archsimd.LoadFloat64x4Slice(floats64s)
+ y := archsimd.LoadFloat64x4Slice(floats64s[4:])
+ a := x.IsNaN()
+ b := y.IsNaN()
+ // amd64:"VCMPPD [$]3," -"VPOR"
+ c := a.Or(b)
+ c.ToInt64x4().StoreSlice(sinkInt64s)
+}
+
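+// simdIsNaN512 is the 512-bit variant: here the compares produce mask registers
+// that are converted to vectors before the OR, so the matched pattern includes
+// the mask-to-vector moves, but the result should still be a single VCMPPD $3
+// and no vector OR.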
+func simdIsNaN512() {
+ x := archsimd.LoadFloat64x8Slice(floats64s)
+ y := archsimd.LoadFloat64x8Slice(floats64s[8:])
+ a := x.IsNaN()
+ b := y.IsNaN()
+ // amd64:"VCMPPD [$]3," -"VPOR"
+ c := a.Or(b)
+ c.ToInt64x8().StoreSlice(sinkInt64s)
+}