ssa.OpAMD64VPMOVVec32x16ToM,
ssa.OpAMD64VPMOVVec64x2ToM,
ssa.OpAMD64VPMOVVec64x4ToM,
- ssa.OpAMD64VPMOVVec64x8ToM:
+ ssa.OpAMD64VPMOVVec64x8ToM,
+ ssa.OpAMD64VPMOVMSKB128,
+ ssa.OpAMD64VPMOVMSKB256,
+ ssa.OpAMD64VMOVMSKPS128,
+ ssa.OpAMD64VMOVMSKPS256,
+ ssa.OpAMD64VMOVMSKPD128,
+ ssa.OpAMD64VMOVMSKPD256:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[0])
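
The new cases reuse the existing register-to-register lowering above, so each 128/256-bit conversion becomes a single instruction and no longer needs AVX-512. A rough before/after sketch for Mask8x16.ToBits (assumed codegen, not taken from an actual build):

	VPMOVB2M  X1, K1   // before: AVX-512 path
	KMOVW     K1, AX
	VPMOVMSKB X1, AX   // after: single AVX instruction
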
(Cvt8toMask64x8 <t> x) => (VPMOVMToVec64x8 <types.TypeVec512> (KMOVBk <t> x))
// masks to integers
-(CvtMask8x16to16 <t> x) => (KMOVWi <t> (VPMOVVec8x16ToM <types.TypeMask> x))
-(CvtMask8x32to32 <t> x) => (KMOVDi <t> (VPMOVVec8x32ToM <types.TypeMask> x))
-(CvtMask8x64to64 <t> x) => (KMOVQi <t> (VPMOVVec8x64ToM <types.TypeMask> x))
+(CvtMask8x16to16 ...) => (VPMOVMSKB128 ...)
+(CvtMask8x32to32 ...) => (VPMOVMSKB256 ...)
+(CvtMask8x64to64 x) => (KMOVQi (VPMOVVec8x64ToM <types.TypeMask> x))
-(CvtMask16x8to8 <t> x) => (KMOVBi <t> (VPMOVVec16x8ToM <types.TypeMask> x))
-(CvtMask16x16to16 <t> x) => (KMOVWi <t> (VPMOVVec16x16ToM <types.TypeMask> x))
-(CvtMask16x32to32 <t> x) => (KMOVDi <t> (VPMOVVec16x32ToM <types.TypeMask> x))
+(CvtMask16x8to8 x) => (KMOVBi (VPMOVVec16x8ToM <types.TypeMask> x))
+(CvtMask16x16to16 x) => (KMOVWi (VPMOVVec16x16ToM <types.TypeMask> x))
+(CvtMask16x32to32 x) => (KMOVDi (VPMOVVec16x32ToM <types.TypeMask> x))
-(CvtMask32x4to8 <t> x) => (KMOVBi <t> (VPMOVVec32x4ToM <types.TypeMask> x))
-(CvtMask32x8to8 <t> x) => (KMOVBi <t> (VPMOVVec32x8ToM <types.TypeMask> x))
-(CvtMask32x16to16 <t> x) => (KMOVWi <t> (VPMOVVec32x16ToM <types.TypeMask> x))
+(CvtMask32x4to8 ...) => (VMOVMSKPS128 ...)
+(CvtMask32x8to8 ...) => (VMOVMSKPS256 ...)
+(CvtMask32x16to16 x) => (KMOVWi (VPMOVVec32x16ToM <types.TypeMask> x))
-(CvtMask64x2to8 <t> x) => (KMOVBi <t> (VPMOVVec64x2ToM <types.TypeMask> x))
-(CvtMask64x4to8 <t> x) => (KMOVBi <t> (VPMOVVec64x4ToM <types.TypeMask> x))
-(CvtMask64x8to8 <t> x) => (KMOVBi <t> (VPMOVVec64x8ToM <types.TypeMask> x))
+(CvtMask64x2to8 ...) => (VMOVMSKPD128 ...)
+(CvtMask64x4to8 ...) => (VMOVMSKPD256 ...)
+(CvtMask64x8to8 x) => (KMOVBi (VPMOVVec64x8ToM <types.TypeMask> x))
// optimizations
(MOVBstore [off] {sym} ptr (KMOVBi mask) mem) => (KMOVBstore [off] {sym} ptr mask mem)
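
The folding above lets a ToBits result that is stored straight to memory keep the value in the K register. A minimal Go-level sketch of the pattern it targets (hypothetical helper; archsimd and Mask64x8 follow the test file below):

func storeBits(dst *uint8, m archsimd.Mask64x8) {
	// With the MOVBstore/KMOVBstore rule this should become a single KMOVB to
	// memory rather than KMOVB to a register followed by MOVB to memory.
	*dst = m.ToBits()
}
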
{name: "VPMASK64load512", argLength: 3, reg: vloadk, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1=k mask, arg2 = mem
{name: "VPMASK64store512", argLength: 4, reg: vstorek, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg2, arg1=k mask, arg3 = mem
+ // AVX512 moves between int-vector and mask registers
{name: "VPMOVMToVec8x16", argLength: 1, reg: kv, asm: "VPMOVM2B"},
{name: "VPMOVMToVec8x32", argLength: 1, reg: kv, asm: "VPMOVM2B"},
{name: "VPMOVMToVec8x64", argLength: 1, reg: kw, asm: "VPMOVM2B"},
{name: "VPMOVVec64x4ToM", argLength: 1, reg: vk, asm: "VPMOVQ2M"},
{name: "VPMOVVec64x8ToM", argLength: 1, reg: wk, asm: "VPMOVQ2M"},
+ // AVX/AVX2 moves from a vector register to a GP-register bitmask (one sign bit per lane)
+ {name: "VPMOVMSKB128", argLength: 1, reg: vgp, asm: "VPMOVMSKB"},
+ {name: "VPMOVMSKB256", argLength: 1, reg: vgp, asm: "VPMOVMSKB"},
+ {name: "VMOVMSKPS128", argLength: 1, reg: vgp, asm: "VMOVMSKPS"},
+ {name: "VMOVMSKPS256", argLength: 1, reg: vgp, asm: "VMOVMSKPS"},
+ {name: "VMOVMSKPD128", argLength: 1, reg: vgp, asm: "VMOVMSKPD"},
+ {name: "VMOVMSKPD256", argLength: 1, reg: vgp, asm: "VMOVMSKPD"},
+
// X15 is the zero register up to 128-bit. For larger values, we zero it on the fly.
{name: "Zero128", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true},
{name: "Zero256", argLength: 0, reg: v01, asm: "VPXOR"},
OpAMD64VPMOVVec64x2ToM
OpAMD64VPMOVVec64x4ToM
OpAMD64VPMOVVec64x8ToM
+ OpAMD64VPMOVMSKB128
+ OpAMD64VPMOVMSKB256
+ OpAMD64VMOVMSKPS128
+ OpAMD64VMOVMSKPS256
+ OpAMD64VMOVMSKPD128
+ OpAMD64VMOVMSKPD256
OpAMD64Zero128
OpAMD64Zero256
OpAMD64Zero512
},
},
},
+ {
+ name: "VPMOVMSKB128",
+ argLen: 1,
+ asm: x86.AVPMOVMSKB,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
+ {
+ name: "VPMOVMSKB256",
+ argLen: 1,
+ asm: x86.AVPMOVMSKB,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
+ {
+ name: "VMOVMSKPS128",
+ argLen: 1,
+ asm: x86.AVMOVMSKPS,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
+ {
+ name: "VMOVMSKPS256",
+ argLen: 1,
+ asm: x86.AVMOVMSKPS,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
+ {
+ name: "VMOVMSKPD128",
+ argLen: 1,
+ asm: x86.AVMOVMSKPD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
+ {
+ name: "VMOVMSKPD256",
+ argLen: 1,
+ asm: x86.AVMOVMSKPD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
{
name: "Zero128",
argLen: 0,
case OpCvtMask32x16to16:
return rewriteValueAMD64_OpCvtMask32x16to16(v)
case OpCvtMask32x4to8:
- return rewriteValueAMD64_OpCvtMask32x4to8(v)
+ v.Op = OpAMD64VMOVMSKPS128
+ return true
case OpCvtMask32x8to8:
- return rewriteValueAMD64_OpCvtMask32x8to8(v)
+ v.Op = OpAMD64VMOVMSKPS256
+ return true
case OpCvtMask64x2to8:
- return rewriteValueAMD64_OpCvtMask64x2to8(v)
+ v.Op = OpAMD64VMOVMSKPD128
+ return true
case OpCvtMask64x4to8:
- return rewriteValueAMD64_OpCvtMask64x4to8(v)
+ v.Op = OpAMD64VMOVMSKPD256
+ return true
case OpCvtMask64x8to8:
return rewriteValueAMD64_OpCvtMask64x8to8(v)
case OpCvtMask8x16to16:
- return rewriteValueAMD64_OpCvtMask8x16to16(v)
+ v.Op = OpAMD64VPMOVMSKB128
+ return true
case OpCvtMask8x32to32:
- return rewriteValueAMD64_OpCvtMask8x32to32(v)
+ v.Op = OpAMD64VPMOVMSKB256
+ return true
case OpCvtMask8x64to64:
return rewriteValueAMD64_OpCvtMask8x64to64(v)
case OpDiv128u:
func rewriteValueAMD64_OpCvtMask16x16to16(v *Value) bool {
v_0 := v.Args[0]
b := v.Block
- // match: (CvtMask16x16to16 <t> x)
- // result: (KMOVWi <t> (VPMOVVec16x16ToM <types.TypeMask> x))
+ // match: (CvtMask16x16to16 x)
+ // result: (KMOVWi (VPMOVVec16x16ToM <types.TypeMask> x))
for {
- t := v.Type
x := v_0
v.reset(OpAMD64KMOVWi)
- v.Type = t
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
v0.AddArg(x)
v.AddArg(v0)
func rewriteValueAMD64_OpCvtMask16x32to32(v *Value) bool {
v_0 := v.Args[0]
b := v.Block
- // match: (CvtMask16x32to32 <t> x)
- // result: (KMOVDi <t> (VPMOVVec16x32ToM <types.TypeMask> x))
+ // match: (CvtMask16x32to32 x)
+ // result: (KMOVDi (VPMOVVec16x32ToM <types.TypeMask> x))
for {
- t := v.Type
x := v_0
v.reset(OpAMD64KMOVDi)
- v.Type = t
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
v0.AddArg(x)
v.AddArg(v0)
func rewriteValueAMD64_OpCvtMask16x8to8(v *Value) bool {
v_0 := v.Args[0]
b := v.Block
- // match: (CvtMask16x8to8 <t> x)
- // result: (KMOVBi <t> (VPMOVVec16x8ToM <types.TypeMask> x))
+ // match: (CvtMask16x8to8 x)
+ // result: (KMOVBi (VPMOVVec16x8ToM <types.TypeMask> x))
for {
- t := v.Type
x := v_0
v.reset(OpAMD64KMOVBi)
- v.Type = t
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
v0.AddArg(x)
v.AddArg(v0)
func rewriteValueAMD64_OpCvtMask32x16to16(v *Value) bool {
v_0 := v.Args[0]
b := v.Block
- // match: (CvtMask32x16to16 <t> x)
- // result: (KMOVWi <t> (VPMOVVec32x16ToM <types.TypeMask> x))
+ // match: (CvtMask32x16to16 x)
+ // result: (KMOVWi (VPMOVVec32x16ToM <types.TypeMask> x))
for {
- t := v.Type
x := v_0
v.reset(OpAMD64KMOVWi)
- v.Type = t
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
v0.AddArg(x)
v.AddArg(v0)
return true
}
}
-func rewriteValueAMD64_OpCvtMask32x4to8(v *Value) bool {
- v_0 := v.Args[0]
- b := v.Block
- // match: (CvtMask32x4to8 <t> x)
- // result: (KMOVBi <t> (VPMOVVec32x4ToM <types.TypeMask> x))
- for {
- t := v.Type
- x := v_0
- v.reset(OpAMD64KMOVBi)
- v.Type = t
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
- v0.AddArg(x)
- v.AddArg(v0)
- return true
- }
-}
-func rewriteValueAMD64_OpCvtMask32x8to8(v *Value) bool {
- v_0 := v.Args[0]
- b := v.Block
- // match: (CvtMask32x8to8 <t> x)
- // result: (KMOVBi <t> (VPMOVVec32x8ToM <types.TypeMask> x))
- for {
- t := v.Type
- x := v_0
- v.reset(OpAMD64KMOVBi)
- v.Type = t
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
- v0.AddArg(x)
- v.AddArg(v0)
- return true
- }
-}
-func rewriteValueAMD64_OpCvtMask64x2to8(v *Value) bool {
- v_0 := v.Args[0]
- b := v.Block
- // match: (CvtMask64x2to8 <t> x)
- // result: (KMOVBi <t> (VPMOVVec64x2ToM <types.TypeMask> x))
- for {
- t := v.Type
- x := v_0
- v.reset(OpAMD64KMOVBi)
- v.Type = t
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
- v0.AddArg(x)
- v.AddArg(v0)
- return true
- }
-}
-func rewriteValueAMD64_OpCvtMask64x4to8(v *Value) bool {
- v_0 := v.Args[0]
- b := v.Block
- // match: (CvtMask64x4to8 <t> x)
- // result: (KMOVBi <t> (VPMOVVec64x4ToM <types.TypeMask> x))
- for {
- t := v.Type
- x := v_0
- v.reset(OpAMD64KMOVBi)
- v.Type = t
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
- v0.AddArg(x)
- v.AddArg(v0)
- return true
- }
-}
func rewriteValueAMD64_OpCvtMask64x8to8(v *Value) bool {
v_0 := v.Args[0]
b := v.Block
- // match: (CvtMask64x8to8 <t> x)
- // result: (KMOVBi <t> (VPMOVVec64x8ToM <types.TypeMask> x))
+ // match: (CvtMask64x8to8 x)
+ // result: (KMOVBi (VPMOVVec64x8ToM <types.TypeMask> x))
for {
- t := v.Type
x := v_0
v.reset(OpAMD64KMOVBi)
- v.Type = t
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
v0.AddArg(x)
v.AddArg(v0)
return true
}
}
-func rewriteValueAMD64_OpCvtMask8x16to16(v *Value) bool {
- v_0 := v.Args[0]
- b := v.Block
- // match: (CvtMask8x16to16 <t> x)
- // result: (KMOVWi <t> (VPMOVVec8x16ToM <types.TypeMask> x))
- for {
- t := v.Type
- x := v_0
- v.reset(OpAMD64KMOVWi)
- v.Type = t
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
- v0.AddArg(x)
- v.AddArg(v0)
- return true
- }
-}
-func rewriteValueAMD64_OpCvtMask8x32to32(v *Value) bool {
- v_0 := v.Args[0]
- b := v.Block
- // match: (CvtMask8x32to32 <t> x)
- // result: (KMOVDi <t> (VPMOVVec8x32ToM <types.TypeMask> x))
- for {
- t := v.Type
- x := v_0
- v.reset(OpAMD64KMOVDi)
- v.Type = t
- v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
- v0.AddArg(x)
- v.AddArg(v0)
- return true
- }
-}
func rewriteValueAMD64_OpCvtMask8x64to64(v *Value) bool {
v_0 := v.Args[0]
b := v.Block
- // match: (CvtMask8x64to64 <t> x)
- // result: (KMOVQi <t> (VPMOVVec8x64ToM <types.TypeMask> x))
+ // match: (CvtMask8x64to64 x)
+ // result: (KMOVQi (VPMOVVec8x64ToM <types.TypeMask> x))
for {
- t := v.Type
x := v_0
v.reset(OpAMD64KMOVQi)
- v.Type = t
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
v0.AddArg(x)
v.AddArg(v0)
}
}
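+// ToBitsDoc returns the "// Asm: ..., CPU Features: ..." doc line for a type's
+// ToBits method: 512-bit shapes and 16-bit-element shapes use the AVX-512 KMOV
+// path, while the remaining 128/256-bit shapes use VPMOVMSKB / VMOVMSKPS /
+// VMOVMSKPD and need only AVX (AVX2 for the 256-bit byte case). For Mask8x32,
+// for example, it returns "// Asm: VPMOVMSKB, CPU Features: AVX2".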
+func (x simdType) ToBitsDoc() string {
+ if x.Size == 512 || x.ElemBits() == 16 {
+ return fmt.Sprintf("// Asm: KMOV%s, CPU Features: AVX512", x.IntelSizeSuffix())
+ }
+ // 128/256-bit vectors with 8-, 32-, or 64-bit elements
+ var asm string
+ var feat string
+ switch x.ElemBits() {
+ case 8:
+ asm = "VPMOVMSKB"
+ if x.Size == 256 {
+ feat = "AVX2"
+ } else {
+ feat = "AVX"
+ }
+ case 32:
+ asm = "VMOVMSKPS"
+ feat = "AVX"
+ case 64:
+ asm = "VMOVMSKPD"
+ feat = "AVX"
+ default:
+ panic("unexpected ElemBits")
+ }
+ return fmt.Sprintf("// Asm: %s, CPU Features: %s", asm, feat)
+}
+
func compareSimdTypes(x, y simdType) int {
// "vreg" then "mask"
if c := -compareNatural(x.Type, y.Type); c != 0 {
// Only the lower {{.Lanes}} bits of y are used.
{{- end}}
//
-// Asm: KMOV{{.IntelSizeSuffix}}, CPU Features: AVX512
+{{.ToBitsDoc}}
func (x {{.Name}}) ToBits() uint{{.LanesContainer}}
`
}
func TestBitMaskToBits(t *testing.T) {
- if !archsimd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
+ int8s := []int8{
+ 0, 1, 1, 0, 0, 1, 0, 1,
+ 1, 0, 1, 1, 0, 0, 1, 0,
+ 1, 0, 0, 1, 1, 0, 1, 0,
+ 0, 1, 1, 0, 0, 1, 0, 1,
+ 1, 0, 0, 1, 0, 1, 1, 0,
+ 0, 1, 0, 1, 1, 0, 0, 1,
+ 1, 0, 1, 0, 0, 1, 1, 0,
+ 0, 1, 1, 0, 1, 0, 0, 1,
+ }
+ int16s := make([]int16, 32)
+ for i := range int16s {
+ int16s[i] = int16(int8s[i])
+ }
+ int32s := make([]int32, 16)
+ for i := range int32s {
+ int32s[i] = int32(int8s[i])
+ }
+ int64s := make([]int64, 8)
+ for i := range int64s {
+ int64s[i] = int64(int8s[i])
+ }
+ want64 := uint64(0)
+ for i := range int8s {
+ want64 |= uint64(int8s[i]) << i
+ }
+ want32 := uint32(want64)
+ want16 := uint16(want64)
+ want8 := uint8(want64)
+ want4 := want8 & 0b1111
+ want2 := want4 & 0b11
+
+ if v := archsimd.LoadInt8x16Slice(int8s[:16]).ToMask().ToBits(); v != want16 {
+ t.Errorf("want %b, got %b", want16, v)
+ }
+ if v := archsimd.LoadInt32x4Slice(int32s[:4]).ToMask().ToBits(); v != want4 {
+ t.Errorf("want %b, got %b", want4, v)
+ }
+ if v := archsimd.LoadInt32x8Slice(int32s[:8]).ToMask().ToBits(); v != want8 {
+ t.Errorf("want %b, got %b", want8, v)
+ }
+ if v := archsimd.LoadInt64x2Slice(int64s[:2]).ToMask().ToBits(); v != want2 {
+ t.Errorf("want %b, got %b", want2, v)
+ }
+ if v := archsimd.LoadInt64x4Slice(int64s[:4]).ToMask().ToBits(); v != want4 {
+ t.Errorf("want %b, got %b", want4, v)
}
- if v := archsimd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits(); v != 0b101 {
- t.Errorf("Want 0b101, got %b", v)
+
+ if archsimd.X86.AVX2() {
+ if v := archsimd.LoadInt8x32Slice(int8s[:32]).ToMask().ToBits(); v != want32 {
+ t.Errorf("want %b, got %b", want32, v)
+ }
+ }
+
+ if archsimd.X86.AVX512() {
+ if v := archsimd.LoadInt8x64Slice(int8s).ToMask().ToBits(); v != want64 {
+ t.Errorf("want %b, got %b", want64, v)
+ }
+ if v := archsimd.LoadInt16x8Slice(int16s[:8]).ToMask().ToBits(); v != want8 {
+ t.Errorf("want %b, got %b", want8, v)
+ }
+ if v := archsimd.LoadInt16x16Slice(int16s[:16]).ToMask().ToBits(); v != want16 {
+ t.Errorf("want %b, got %b", want16, v)
+ }
+ if v := archsimd.LoadInt16x32Slice(int16s).ToMask().ToBits(); v != want32 {
+ t.Errorf("want %b, got %b", want32, v)
+ }
+ if v := archsimd.LoadInt32x16Slice(int32s).ToMask().ToBits(); v != want16 {
+ t.Errorf("want %b, got %b", want16, v)
+ }
+ if v := archsimd.LoadInt64x8Slice(int64s).ToMask().ToBits(); v != want8 {
+ t.Errorf("want %b, got %b", want8, v)
+ }
}
}
// ToBits constructs a bitmap from a Mask8x16, where 1 means set for the indexed element, 0 means unset.
//
-// Asm: KMOVB, CPU Features: AVX512
+// Asm: VPMOVMSKB, CPU Features: AVX
func (x Mask8x16) ToBits() uint16
// Mask16x8 is a mask for a SIMD vector of 8 16-bit elements.
// ToBits constructs a bitmap from a Mask32x4, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
-// Asm: KMOVD, CPU Features: AVX512
+// Asm: VMOVMSKPS, CPU Features: AVX
func (x Mask32x4) ToBits() uint8
// Mask64x2 is a mask for a SIMD vector of 2 64-bit elements.
// ToBits constructs a bitmap from a Mask64x2, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used.
//
-// Asm: KMOVQ, CPU Features: AVX512
+// Asm: VMOVMSKPD, CPU Features: AVX
func (x Mask64x2) ToBits() uint8
// v256 is a tag type that tells the compiler that this is really 256-bit SIMD
// ToBits constructs a bitmap from a Mask8x32, where 1 means set for the indexed element, 0 means unset.
//
-// Asm: KMOVB, CPU Features: AVX512
+// Asm: VPMOVMSKB, CPU Features: AVX2
func (x Mask8x32) ToBits() uint32
// Mask16x16 is a mask for a SIMD vector of 16 16-bit elements.
// ToBits constructs a bitmap from a Mask32x8, where 1 means set for the indexed element, 0 means unset.
//
-// Asm: KMOVD, CPU Features: AVX512
+// Asm: VMOVMSKPS, CPU Features: AVX
func (x Mask32x8) ToBits() uint8
// Mask64x4 is a mask for a SIMD vector of 4 64-bit elements.
// ToBits constructs a bitmap from a Mask64x4, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
-// Asm: KMOVQ, CPU Features: AVX512
+// Asm: VMOVMSKPD, CPU Features: AVX
func (x Mask64x4) ToBits() uint8
// v512 is a tag type that tells the compiler that this is really 512-bit SIMD
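
A minimal usage sketch of the documented methods (package name and lane values follow the test file above; not part of the generated API). The tests show ToMask treating non-zero lanes as set, so:

	// Sketch only: returns 0b0101 for the lanes below, lowered to a single
	// VMOVMSKPS on AVX-only hardware instead of the former AVX-512 sequence.
	func exampleMask32x4Bits() uint8 {
		m := archsimd.LoadInt32x4Slice([]int32{-1, 0, -1, 0}).ToMask()
		return m.ToBits()
	}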