From: Cherry Mui
Date: Mon, 5 Jan 2026 17:56:08 +0000 (-0500)
Subject: simd/archsimd: use V(P)MOVMSK for mask ToBits if possible
X-Git-Tag: go1.26rc2~7^2~2
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=9b2e3b9a02bd8872bdbf5c6086674fa6b4bc8ef9;p=gostls13.git

simd/archsimd: use V(P)MOVMSK for mask ToBits if possible

VPMOVMSKB, VMOVMSKPS, and VMOVMSKPD move AVX1/2-style masks to integer
registers, similar to VPMOV[BWDQ]2M (which moves to mask registers).
The former are available with AVX1/2, while the latter requires AVX512.
So use the former when it is supported, i.e. for 128- and 256-bit
vectors with 8-, 32-, and 64-bit elements (16-bit elements always
require AVX512).

Change-Id: I972195116617ed2faaf95cee5cd6b250e671496c
Reviewed-on: https://go-review.googlesource.com/c/go/+/734060
LUCI-TryBot-Result: Go LUCI
Reviewed-by: David Chase
Reviewed-by: Junyang Shao
---

diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 5ddcb84c59..e9a566d759 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -1845,7 +1845,13 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		ssa.OpAMD64VPMOVVec32x16ToM,
 		ssa.OpAMD64VPMOVVec64x2ToM,
 		ssa.OpAMD64VPMOVVec64x4ToM,
-		ssa.OpAMD64VPMOVVec64x8ToM:
+		ssa.OpAMD64VPMOVVec64x8ToM,
+		ssa.OpAMD64VPMOVMSKB128,
+		ssa.OpAMD64VPMOVMSKB256,
+		ssa.OpAMD64VMOVMSKPS128,
+		ssa.OpAMD64VMOVMSKPS256,
+		ssa.OpAMD64VMOVMSKPD128,
+		ssa.OpAMD64VMOVMSKPD256:
 		p := s.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_REG
 		p.From.Reg = simdReg(v.Args[0])
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
index 9cd23c6286..b49e85b53c 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
@@ -1679,21 +1679,21 @@
 (Cvt8toMask64x8 x) => (VPMOVMToVec64x8 (KMOVBk x))

 // masks to integers
-(CvtMask8x16to16 x) => (KMOVWi (VPMOVVec8x16ToM x))
-(CvtMask8x32to32 x) => (KMOVDi (VPMOVVec8x32ToM x))
-(CvtMask8x64to64 x) => (KMOVQi (VPMOVVec8x64ToM x))
+(CvtMask8x16to16 ...) => (VPMOVMSKB128 ...)
+(CvtMask8x32to32 ...) => (VPMOVMSKB256 ...)
+(CvtMask8x64to64 x) => (KMOVQi (VPMOVVec8x64ToM x))

-(CvtMask16x8to8 x) => (KMOVBi (VPMOVVec16x8ToM x))
-(CvtMask16x16to16 x) => (KMOVWi (VPMOVVec16x16ToM x))
-(CvtMask16x32to32 x) => (KMOVDi (VPMOVVec16x32ToM x))
+(CvtMask16x8to8 x) => (KMOVBi (VPMOVVec16x8ToM x))
+(CvtMask16x16to16 x) => (KMOVWi (VPMOVVec16x16ToM x))
+(CvtMask16x32to32 x) => (KMOVDi (VPMOVVec16x32ToM x))

-(CvtMask32x4to8 x) => (KMOVBi (VPMOVVec32x4ToM x))
-(CvtMask32x8to8 x) => (KMOVBi (VPMOVVec32x8ToM x))
-(CvtMask32x16to16 x) => (KMOVWi (VPMOVVec32x16ToM x))
+(CvtMask32x4to8 ...) => (VMOVMSKPS128 ...)
+(CvtMask32x8to8 ...) => (VMOVMSKPS256 ...)
+(CvtMask32x16to16 x) => (KMOVWi (VPMOVVec32x16ToM x))

-(CvtMask64x2to8 x) => (KMOVBi (VPMOVVec64x2ToM x))
-(CvtMask64x4to8 x) => (KMOVBi (VPMOVVec64x4ToM x))
-(CvtMask64x8to8 x) => (KMOVBi (VPMOVVec64x8ToM x))
+(CvtMask64x2to8 ...) => (VMOVMSKPD128 ...)
+(CvtMask64x4to8 ...) => (VMOVMSKPD256 ...)
+(CvtMask64x8to8 x) => (KMOVBi (VPMOVVec64x8ToM x))

 // optimizations
 (MOVBstore [off] {sym} ptr (KMOVBi mask) mem) => (KMOVBstore [off] {sym} ptr mask mem)
diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
index 2fb4fdfc96..b13eb5aa21 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
@@ -1368,6 +1368,7 @@ func init() {
 		{name: "VPMASK64load512", argLength: 3, reg: vloadk, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1=k mask, arg2 = mem
 		{name: "VPMASK64store512", argLength: 4, reg: vstorek, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg2, arg1=k mask, arg3 = mem

+		// AVX512 moves between int-vector and mask registers
 		{name: "VPMOVMToVec8x16", argLength: 1, reg: kv, asm: "VPMOVM2B"},
 		{name: "VPMOVMToVec8x32", argLength: 1, reg: kv, asm: "VPMOVM2B"},
 		{name: "VPMOVMToVec8x64", argLength: 1, reg: kw, asm: "VPMOVM2B"},
@@ -1400,6 +1401,14 @@ func init() {
 		{name: "VPMOVVec64x4ToM", argLength: 1, reg: vk, asm: "VPMOVQ2M"},
 		{name: "VPMOVVec64x8ToM", argLength: 1, reg: wk, asm: "VPMOVQ2M"},

+		// AVX1/2 moves from int-vector to bitmask (extracting sign bits)
+		{name: "VPMOVMSKB128", argLength: 1, reg: vgp, asm: "VPMOVMSKB"},
+		{name: "VPMOVMSKB256", argLength: 1, reg: vgp, asm: "VPMOVMSKB"},
+		{name: "VMOVMSKPS128", argLength: 1, reg: vgp, asm: "VMOVMSKPS"},
+		{name: "VMOVMSKPS256", argLength: 1, reg: vgp, asm: "VMOVMSKPS"},
+		{name: "VMOVMSKPD128", argLength: 1, reg: vgp, asm: "VMOVMSKPD"},
+		{name: "VMOVMSKPD256", argLength: 1, reg: vgp, asm: "VMOVMSKPD"},
+
 		// X15 is the zero register up to 128-bit. For larger values, we zero it on the fly.
{name: "Zero128", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true}, {name: "Zero256", argLength: 0, reg: v01, asm: "VPXOR"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index abaf7911d4..7b70dc2686 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1214,6 +1214,12 @@ const ( OpAMD64VPMOVVec64x2ToM OpAMD64VPMOVVec64x4ToM OpAMD64VPMOVVec64x8ToM + OpAMD64VPMOVMSKB128 + OpAMD64VPMOVMSKB256 + OpAMD64VMOVMSKPS128 + OpAMD64VMOVMSKPS256 + OpAMD64VMOVMSKPD128 + OpAMD64VMOVMSKPD256 OpAMD64Zero128 OpAMD64Zero256 OpAMD64Zero512 @@ -20351,6 +20357,84 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPMOVMSKB128", + argLen: 1, + asm: x86.AVPMOVMSKB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, + { + name: "VPMOVMSKB256", + argLen: 1, + asm: x86.AVPMOVMSKB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, + { + name: "VMOVMSKPS128", + argLen: 1, + asm: x86.AVMOVMSKPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, + { + name: "VMOVMSKPS256", + argLen: 1, + asm: x86.AVMOVMSKPS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, + { + name: "VMOVMSKPD128", + argLen: 1, + asm: x86.AVMOVMSKPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, + { + name: "VMOVMSKPD256", + argLen: 1, + asm: x86.AVMOVMSKPD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, { name: "Zero128", argLen: 0, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 3eb2a6278b..e84bf19c83 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -3050,19 +3050,25 @@ func rewriteValueAMD64(v *Value) bool { case OpCvtMask32x16to16: return rewriteValueAMD64_OpCvtMask32x16to16(v) case OpCvtMask32x4to8: - return rewriteValueAMD64_OpCvtMask32x4to8(v) + v.Op = OpAMD64VMOVMSKPS128 + return true case OpCvtMask32x8to8: - return rewriteValueAMD64_OpCvtMask32x8to8(v) + v.Op = OpAMD64VMOVMSKPS256 + return true case OpCvtMask64x2to8: - return rewriteValueAMD64_OpCvtMask64x2to8(v) + v.Op = OpAMD64VMOVMSKPD128 + return true case OpCvtMask64x4to8: - return rewriteValueAMD64_OpCvtMask64x4to8(v) + v.Op = OpAMD64VMOVMSKPD256 + return true case OpCvtMask64x8to8: return rewriteValueAMD64_OpCvtMask64x8to8(v) case OpCvtMask8x16to16: - return rewriteValueAMD64_OpCvtMask8x16to16(v) + v.Op = OpAMD64VPMOVMSKB128 + return true case OpCvtMask8x32to32: - return rewriteValueAMD64_OpCvtMask8x32to32(v) + v.Op = OpAMD64VPMOVMSKB256 + return 
true case OpCvtMask8x64to64: return rewriteValueAMD64_OpCvtMask8x64to64(v) case OpDiv128u: @@ -68722,13 +68728,11 @@ func rewriteValueAMD64_OpCvt8toMask64x8(v *Value) bool { func rewriteValueAMD64_OpCvtMask16x16to16(v *Value) bool { v_0 := v.Args[0] b := v.Block - // match: (CvtMask16x16to16 x) - // result: (KMOVWi (VPMOVVec16x16ToM x)) + // match: (CvtMask16x16to16 x) + // result: (KMOVWi (VPMOVVec16x16ToM x)) for { - t := v.Type x := v_0 v.reset(OpAMD64KMOVWi) - v.Type = t v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) v0.AddArg(x) v.AddArg(v0) @@ -68738,13 +68742,11 @@ func rewriteValueAMD64_OpCvtMask16x16to16(v *Value) bool { func rewriteValueAMD64_OpCvtMask16x32to32(v *Value) bool { v_0 := v.Args[0] b := v.Block - // match: (CvtMask16x32to32 x) - // result: (KMOVDi (VPMOVVec16x32ToM x)) + // match: (CvtMask16x32to32 x) + // result: (KMOVDi (VPMOVVec16x32ToM x)) for { - t := v.Type x := v_0 v.reset(OpAMD64KMOVDi) - v.Type = t v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) v0.AddArg(x) v.AddArg(v0) @@ -68754,13 +68756,11 @@ func rewriteValueAMD64_OpCvtMask16x32to32(v *Value) bool { func rewriteValueAMD64_OpCvtMask16x8to8(v *Value) bool { v_0 := v.Args[0] b := v.Block - // match: (CvtMask16x8to8 x) - // result: (KMOVBi (VPMOVVec16x8ToM x)) + // match: (CvtMask16x8to8 x) + // result: (KMOVBi (VPMOVVec16x8ToM x)) for { - t := v.Type x := v_0 v.reset(OpAMD64KMOVBi) - v.Type = t v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) v0.AddArg(x) v.AddArg(v0) @@ -68770,141 +68770,39 @@ func rewriteValueAMD64_OpCvtMask16x8to8(v *Value) bool { func rewriteValueAMD64_OpCvtMask32x16to16(v *Value) bool { v_0 := v.Args[0] b := v.Block - // match: (CvtMask32x16to16 x) - // result: (KMOVWi (VPMOVVec32x16ToM x)) + // match: (CvtMask32x16to16 x) + // result: (KMOVWi (VPMOVVec32x16ToM x)) for { - t := v.Type x := v_0 v.reset(OpAMD64KMOVWi) - v.Type = t v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) v0.AddArg(x) v.AddArg(v0) return true } } -func rewriteValueAMD64_OpCvtMask32x4to8(v *Value) bool { - v_0 := v.Args[0] - b := v.Block - // match: (CvtMask32x4to8 x) - // result: (KMOVBi (VPMOVVec32x4ToM x)) - for { - t := v.Type - x := v_0 - v.reset(OpAMD64KMOVBi) - v.Type = t - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(x) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpCvtMask32x8to8(v *Value) bool { - v_0 := v.Args[0] - b := v.Block - // match: (CvtMask32x8to8 x) - // result: (KMOVBi (VPMOVVec32x8ToM x)) - for { - t := v.Type - x := v_0 - v.reset(OpAMD64KMOVBi) - v.Type = t - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(x) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpCvtMask64x2to8(v *Value) bool { - v_0 := v.Args[0] - b := v.Block - // match: (CvtMask64x2to8 x) - // result: (KMOVBi (VPMOVVec64x2ToM x)) - for { - t := v.Type - x := v_0 - v.reset(OpAMD64KMOVBi) - v.Type = t - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(x) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpCvtMask64x4to8(v *Value) bool { - v_0 := v.Args[0] - b := v.Block - // match: (CvtMask64x4to8 x) - // result: (KMOVBi (VPMOVVec64x4ToM x)) - for { - t := v.Type - x := v_0 - v.reset(OpAMD64KMOVBi) - v.Type = t - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(x) - v.AddArg(v0) - return true - } -} func rewriteValueAMD64_OpCvtMask64x8to8(v *Value) bool { v_0 := v.Args[0] b := v.Block - // match: 
(CvtMask64x8to8 x) - // result: (KMOVBi (VPMOVVec64x8ToM x)) + // match: (CvtMask64x8to8 x) + // result: (KMOVBi (VPMOVVec64x8ToM x)) for { - t := v.Type x := v_0 v.reset(OpAMD64KMOVBi) - v.Type = t v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) v0.AddArg(x) v.AddArg(v0) return true } } -func rewriteValueAMD64_OpCvtMask8x16to16(v *Value) bool { - v_0 := v.Args[0] - b := v.Block - // match: (CvtMask8x16to16 x) - // result: (KMOVWi (VPMOVVec8x16ToM x)) - for { - t := v.Type - x := v_0 - v.reset(OpAMD64KMOVWi) - v.Type = t - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) - v0.AddArg(x) - v.AddArg(v0) - return true - } -} -func rewriteValueAMD64_OpCvtMask8x32to32(v *Value) bool { - v_0 := v.Args[0] - b := v.Block - // match: (CvtMask8x32to32 x) - // result: (KMOVDi (VPMOVVec8x32ToM x)) - for { - t := v.Type - x := v_0 - v.reset(OpAMD64KMOVDi) - v.Type = t - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) - v0.AddArg(x) - v.AddArg(v0) - return true - } -} func rewriteValueAMD64_OpCvtMask8x64to64(v *Value) bool { v_0 := v.Args[0] b := v.Block - // match: (CvtMask8x64to64 x) - // result: (KMOVQi (VPMOVVec8x64ToM x)) + // match: (CvtMask8x64to64 x) + // result: (KMOVQi (VPMOVVec8x64ToM x)) for { - t := v.Type x := v_0 v.reset(OpAMD64KMOVQi) - v.Type = t v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) v0.AddArg(x) v.AddArg(v0) diff --git a/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go b/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go index d8c4481296..dd3a75eb44 100644 --- a/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go +++ b/src/simd/archsimd/_gen/simdgen/gen_simdTypes.go @@ -93,6 +93,33 @@ func (x simdType) MaskedStoreDoc() string { } } +func (x simdType) ToBitsDoc() string { + if x.Size == 512 || x.ElemBits() == 16 { + return fmt.Sprintf("// Asm: KMOV%s, CPU Features: AVX512", x.IntelSizeSuffix()) + } + // 128/256 bit vectors with 8, 32, 64 bit elements + var asm string + var feat string + switch x.ElemBits() { + case 8: + asm = "VPMOVMSKB" + if x.Size == 256 { + feat = "AVX2" + } else { + feat = "AVX" + } + case 32: + asm = "VMOVMSKPS" + feat = "AVX" + case 64: + asm = "VMOVMSKPD" + feat = "AVX" + default: + panic("unexpected ElemBits") + } + return fmt.Sprintf("// Asm: %s, CPU Features: %s", asm, feat) +} + func compareSimdTypes(x, y simdType) int { // "vreg" then "mask" if c := -compareNatural(x.Type, y.Type); c != 0 { @@ -210,7 +237,7 @@ func {{.Name}}FromBits(y uint{{.LanesContainer}}) {{.Name}} // Only the lower {{.Lanes}} bits of y are used. 
 {{- end}}
 //
-// Asm: KMOV{{.IntelSizeSuffix}}, CPU Features: AVX512
+{{.ToBitsDoc}}
 func (x {{.Name}}) ToBits() uint{{.LanesContainer}}
 `

diff --git a/src/simd/archsimd/internal/simd_test/simd_test.go b/src/simd/archsimd/internal/simd_test/simd_test.go
index 671ec05e79..36bde92455 100644
--- a/src/simd/archsimd/internal/simd_test/simd_test.go
+++ b/src/simd/archsimd/internal/simd_test/simd_test.go
@@ -379,12 +379,79 @@ func TestBitMaskFromBitsLoad(t *testing.T) {
 }

 func TestBitMaskToBits(t *testing.T) {
-	if !archsimd.X86.AVX512() {
-		t.Skip("Test requires X86.AVX512, not available on this hardware")
-		return
+	int8s := []int8{
+		0, 1, 1, 0, 0, 1, 0, 1,
+		1, 0, 1, 1, 0, 0, 1, 0,
+		1, 0, 0, 1, 1, 0, 1, 0,
+		0, 1, 1, 0, 0, 1, 0, 1,
+		1, 0, 0, 1, 0, 1, 1, 0,
+		0, 1, 0, 1, 1, 0, 0, 1,
+		1, 0, 1, 0, 0, 1, 1, 0,
+		0, 1, 1, 0, 1, 0, 0, 1,
+	}
+	int16s := make([]int16, 32)
+	for i := range int16s {
+		int16s[i] = int16(int8s[i])
+	}
+	int32s := make([]int32, 16)
+	for i := range int32s {
+		int32s[i] = int32(int8s[i])
+	}
+	int64s := make([]int64, 8)
+	for i := range int64s {
+		int64s[i] = int64(int8s[i])
+	}
+	want64 := uint64(0)
+	for i := range int8s {
+		want64 |= uint64(int8s[i]) << i
+	}
+	want32 := uint32(want64)
+	want16 := uint16(want64)
+	want8 := uint8(want64)
+	want4 := want8 & 0b1111
+	want2 := want4 & 0b11
+
+	if v := archsimd.LoadInt8x16Slice(int8s[:16]).ToMask().ToBits(); v != want16 {
+		t.Errorf("want %b, got %b", want16, v)
+	}
+	if v := archsimd.LoadInt32x4Slice(int32s[:4]).ToMask().ToBits(); v != want4 {
+		t.Errorf("want %b, got %b", want4, v)
+	}
+	if v := archsimd.LoadInt32x8Slice(int32s[:8]).ToMask().ToBits(); v != want8 {
+		t.Errorf("want %b, got %b", want8, v)
+	}
+	if v := archsimd.LoadInt64x2Slice(int64s[:2]).ToMask().ToBits(); v != want2 {
+		t.Errorf("want %b, got %b", want2, v)
+	}
+	if v := archsimd.LoadInt64x4Slice(int64s[:4]).ToMask().ToBits(); v != want4 {
+		t.Errorf("want %b, got %b", want4, v)
 	}
-	if v := archsimd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits(); v != 0b101 {
-		t.Errorf("Want 0b101, got %b", v)
+
+	if archsimd.X86.AVX2() {
+		if v := archsimd.LoadInt8x32Slice(int8s[:32]).ToMask().ToBits(); v != want32 {
+			t.Errorf("want %b, got %b", want32, v)
+		}
+	}
+
+	if archsimd.X86.AVX512() {
+		if v := archsimd.LoadInt8x64Slice(int8s).ToMask().ToBits(); v != want64 {
+			t.Errorf("want %b, got %b", want64, v)
+		}
+		if v := archsimd.LoadInt16x8Slice(int16s[:8]).ToMask().ToBits(); v != want8 {
+			t.Errorf("want %b, got %b", want8, v)
+		}
+		if v := archsimd.LoadInt16x16Slice(int16s[:16]).ToMask().ToBits(); v != want16 {
+			t.Errorf("want %b, got %b", want16, v)
+		}
+		if v := archsimd.LoadInt16x32Slice(int16s).ToMask().ToBits(); v != want32 {
+			t.Errorf("want %b, got %b", want32, v)
+		}
+		if v := archsimd.LoadInt32x16Slice(int32s).ToMask().ToBits(); v != want16 {
+			t.Errorf("want %b, got %b", want16, v)
+		}
+		if v := archsimd.LoadInt64x8Slice(int64s).ToMask().ToBits(); v != want8 {
+			t.Errorf("want %b, got %b", want8, v)
+		}
 	}
 }

diff --git a/src/simd/archsimd/types_amd64.go b/src/simd/archsimd/types_amd64.go
index f39549c705..3d0a49dc09 100644
--- a/src/simd/archsimd/types_amd64.go
+++ b/src/simd/archsimd/types_amd64.go
@@ -308,7 +308,7 @@ func Mask8x16FromBits(y uint16) Mask8x16

 // ToBits constructs a bitmap from a Mask8x16, where 1 means set for the indexed element, 0 means unset.
 //
-// Asm: KMOVB, CPU Features: AVX512
+// Asm: VPMOVMSKB, CPU Features: AVX
 func (x Mask8x16) ToBits() uint16

 // Mask16x8 is a mask for a SIMD vector of 8 16-bit elements.
@@ -342,7 +342,7 @@ func Mask32x4FromBits(y uint8) Mask32x4
 // ToBits constructs a bitmap from a Mask32x4, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 4 bits of y are used.
 //
-// Asm: KMOVD, CPU Features: AVX512
+// Asm: VMOVMSKPS, CPU Features: AVX
 func (x Mask32x4) ToBits() uint8

 // Mask64x2 is a mask for a SIMD vector of 2 64-bit elements.
@@ -360,7 +360,7 @@ func Mask64x2FromBits(y uint8) Mask64x2
 // ToBits constructs a bitmap from a Mask64x2, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 2 bits of y are used.
 //
-// Asm: KMOVQ, CPU Features: AVX512
+// Asm: VMOVMSKPD, CPU Features: AVX
 func (x Mask64x2) ToBits() uint8

 // v256 is a tag type that tells the compiler that this is really 256-bit SIMD
@@ -667,7 +667,7 @@ func Mask8x32FromBits(y uint32) Mask8x32

 // ToBits constructs a bitmap from a Mask8x32, where 1 means set for the indexed element, 0 means unset.
 //
-// Asm: KMOVB, CPU Features: AVX512
+// Asm: VPMOVMSKB, CPU Features: AVX2
 func (x Mask8x32) ToBits() uint32

 // Mask16x16 is a mask for a SIMD vector of 16 16-bit elements.
@@ -699,7 +699,7 @@ func Mask32x8FromBits(y uint8) Mask32x8

 // ToBits constructs a bitmap from a Mask32x8, where 1 means set for the indexed element, 0 means unset.
 //
-// Asm: KMOVD, CPU Features: AVX512
+// Asm: VMOVMSKPS, CPU Features: AVX
 func (x Mask32x8) ToBits() uint8

 // Mask64x4 is a mask for a SIMD vector of 4 64-bit elements.
@@ -717,7 +717,7 @@ func Mask64x4FromBits(y uint8) Mask64x4
 // ToBits constructs a bitmap from a Mask64x4, where 1 means set for the indexed element, 0 means unset.
 // Only the lower 4 bits of y are used.
 //
-// Asm: KMOVQ, CPU Features: AVX512
+// Asm: VMOVMSKPD, CPU Features: AVX
 func (x Mask64x4) ToBits() uint8

 // v512 is a tag type that tells the compiler that this is really 512-bit SIMD
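
A minimal usage sketch (not part of this CL) of the user-facing API the change affects, assuming the simd/archsimd import path used by the package's tests above; with this CL, the 128/256-bit mask ToBits calls below can be lowered to V(P)MOVMSK on AVX/AVX2-only hardware instead of requiring AVX512:

package main

import (
	"fmt"
	"simd/archsimd"
)

func main() {
	// Mirrors the CL's test data: lanes holding 1 produce set bits in the mask.
	vals := []int8{
		1, 0, 1, 0, 0, 1, 1, 0,
		0, 0, 1, 0, 1, 0, 0, 1,
	}
	// Mask8x16.ToBits returns a uint16 bitmap; per this CL it is selected as
	// VPMOVMSKB on AVX machines rather than VPMOVB2M + KMOVW (AVX512).
	bits := archsimd.LoadInt8x16Slice(vals).ToMask().ToBits()
	fmt.Printf("%016b\n", bits)
}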