Cypherpunks repositories - gostls13.git/commitdiff
[dev.simd] cmd/compile, simd: add Expand
author: Junyang Shao <shaojunyang@google.com>
Tue, 5 Aug 2025 19:42:12 +0000 (19:42 +0000)
committer: Junyang Shao <shaojunyang@google.com>
Wed, 6 Aug 2025 20:50:57 +0000 (13:50 -0700)
This CL is generated by CL 693336.

Change-Id: Ic1712d49fcad0544fa3c19b0249d8bc65b347104
Reviewed-on: https://go-review.googlesource.com/c/go/+/693375
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>

src/cmd/compile/internal/amd64/simdssa.go
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteAMD64.go
src/cmd/compile/internal/ssagen/simdintrinsics.go
src/simd/ops_amd64.go
src/simd/simd_test.go

index 7a0a0be58fa32908318c58eee274c1596626acad..b778cd7994a7b331aac8023be1f38953d57a1b3f 100644 (file)
@@ -644,6 +644,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VCVTPS2UDQMasked128,
                ssa.OpAMD64VCVTPS2UDQMasked256,
                ssa.OpAMD64VCVTPS2UDQMasked512,
+               ssa.OpAMD64VEXPANDPSMasked128,
+               ssa.OpAMD64VEXPANDPSMasked256,
+               ssa.OpAMD64VEXPANDPSMasked512,
+               ssa.OpAMD64VEXPANDPDMasked128,
+               ssa.OpAMD64VEXPANDPDMasked256,
+               ssa.OpAMD64VEXPANDPDMasked512,
+               ssa.OpAMD64VPEXPANDBMasked128,
+               ssa.OpAMD64VPEXPANDBMasked256,
+               ssa.OpAMD64VPEXPANDBMasked512,
+               ssa.OpAMD64VPEXPANDWMasked128,
+               ssa.OpAMD64VPEXPANDWMasked256,
+               ssa.OpAMD64VPEXPANDWMasked512,
+               ssa.OpAMD64VPEXPANDDMasked128,
+               ssa.OpAMD64VPEXPANDDMasked256,
+               ssa.OpAMD64VPEXPANDDMasked512,
+               ssa.OpAMD64VPEXPANDQMasked128,
+               ssa.OpAMD64VPEXPANDQMasked256,
+               ssa.OpAMD64VPEXPANDQMasked512,
                ssa.OpAMD64VPOPCNTBMasked128,
                ssa.OpAMD64VPOPCNTBMasked256,
                ssa.OpAMD64VPOPCNTBMasked512,
@@ -1229,6 +1247,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VDIVPDMasked128,
                ssa.OpAMD64VDIVPDMasked256,
                ssa.OpAMD64VDIVPDMasked512,
+               ssa.OpAMD64VEXPANDPSMasked128,
+               ssa.OpAMD64VEXPANDPSMasked256,
+               ssa.OpAMD64VEXPANDPSMasked512,
+               ssa.OpAMD64VEXPANDPDMasked128,
+               ssa.OpAMD64VEXPANDPDMasked256,
+               ssa.OpAMD64VEXPANDPDMasked512,
+               ssa.OpAMD64VPEXPANDBMasked128,
+               ssa.OpAMD64VPEXPANDBMasked256,
+               ssa.OpAMD64VPEXPANDBMasked512,
+               ssa.OpAMD64VPEXPANDWMasked128,
+               ssa.OpAMD64VPEXPANDWMasked256,
+               ssa.OpAMD64VPEXPANDWMasked512,
+               ssa.OpAMD64VPEXPANDDMasked128,
+               ssa.OpAMD64VPEXPANDDMasked256,
+               ssa.OpAMD64VPEXPANDDMasked512,
+               ssa.OpAMD64VPEXPANDQMasked128,
+               ssa.OpAMD64VPEXPANDQMasked256,
+               ssa.OpAMD64VPEXPANDQMasked512,
                ssa.OpAMD64VFMADD213PSMasked128,
                ssa.OpAMD64VFMADD213PSMasked256,
                ssa.OpAMD64VFMADD213PSMasked512,
index 316db1b84110686361826fcee37d1d40cc2020db..ae29a9117ea16c5fe38743745d2e74d975bdb7fa 100644 (file)
 (EqualMaskedUint64x2 x y mask) => (VPMOVMToVec64x2 (VPCMPUQMasked128 [0] x y (VPMOVVec64x2ToM <types.TypeMask> mask)))
 (EqualMaskedUint64x4 x y mask) => (VPMOVMToVec64x4 (VPCMPUQMasked256 [0] x y (VPMOVVec64x4ToM <types.TypeMask> mask)))
 (EqualMaskedUint64x8 x y mask) => (VPMOVMToVec64x8 (VPCMPUQMasked512 [0] x y (VPMOVVec64x8ToM <types.TypeMask> mask)))
+(ExpandFloat32x4 x mask) => (VEXPANDPSMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(ExpandFloat32x8 x mask) => (VEXPANDPSMasked256 x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(ExpandFloat32x16 x mask) => (VEXPANDPSMasked512 x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(ExpandFloat64x2 x mask) => (VEXPANDPDMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(ExpandFloat64x4 x mask) => (VEXPANDPDMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(ExpandFloat64x8 x mask) => (VEXPANDPDMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(ExpandInt8x16 x mask) => (VPEXPANDBMasked128 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(ExpandInt8x32 x mask) => (VPEXPANDBMasked256 x (VPMOVVec8x32ToM <types.TypeMask> mask))
+(ExpandInt8x64 x mask) => (VPEXPANDBMasked512 x (VPMOVVec8x64ToM <types.TypeMask> mask))
+(ExpandInt16x8 x mask) => (VPEXPANDWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(ExpandInt16x16 x mask) => (VPEXPANDWMasked256 x (VPMOVVec16x16ToM <types.TypeMask> mask))
+(ExpandInt16x32 x mask) => (VPEXPANDWMasked512 x (VPMOVVec16x32ToM <types.TypeMask> mask))
+(ExpandInt32x4 x mask) => (VPEXPANDDMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(ExpandInt32x8 x mask) => (VPEXPANDDMasked256 x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(ExpandInt32x16 x mask) => (VPEXPANDDMasked512 x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(ExpandInt64x2 x mask) => (VPEXPANDQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(ExpandInt64x4 x mask) => (VPEXPANDQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(ExpandInt64x8 x mask) => (VPEXPANDQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(ExpandUint8x16 x mask) => (VPEXPANDBMasked128 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(ExpandUint8x32 x mask) => (VPEXPANDBMasked256 x (VPMOVVec8x32ToM <types.TypeMask> mask))
+(ExpandUint8x64 x mask) => (VPEXPANDBMasked512 x (VPMOVVec8x64ToM <types.TypeMask> mask))
+(ExpandUint16x8 x mask) => (VPEXPANDWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(ExpandUint16x16 x mask) => (VPEXPANDWMasked256 x (VPMOVVec16x16ToM <types.TypeMask> mask))
+(ExpandUint16x32 x mask) => (VPEXPANDWMasked512 x (VPMOVVec16x32ToM <types.TypeMask> mask))
+(ExpandUint32x4 x mask) => (VPEXPANDDMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(ExpandUint32x8 x mask) => (VPEXPANDDMasked256 x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(ExpandUint32x16 x mask) => (VPEXPANDDMasked512 x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(ExpandUint64x2 x mask) => (VPEXPANDQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(ExpandUint64x4 x mask) => (VPEXPANDQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(ExpandUint64x8 x mask) => (VPEXPANDQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
 (FloorFloat32x4 x) => (VROUNDPS128 [1] x)
 (FloorFloat32x8 x) => (VROUNDPS256 [1] x)
 (FloorFloat64x2 x) => (VROUNDPD128 [1] x)
index 591f8a5bcafb589176ddfef75e56428fc2ff38db..ccda39f59d33d7a178ccdbf3c4067d8bb32a73d2 100644 (file)
@@ -49,6 +49,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                {name: "VDIVPSMasked128", argLength: 3, reg: w2kw, asm: "VDIVPS", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VDIVPSMasked256", argLength: 3, reg: w2kw, asm: "VDIVPS", commutative: false, typ: "Vec256", resultInArg0: false},
                {name: "VDIVPSMasked512", argLength: 3, reg: w2kw, asm: "VDIVPS", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VEXPANDPDMasked128", argLength: 2, reg: wkw, asm: "VEXPANDPD", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VEXPANDPDMasked256", argLength: 2, reg: wkw, asm: "VEXPANDPD", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VEXPANDPDMasked512", argLength: 2, reg: wkw, asm: "VEXPANDPD", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VEXPANDPSMasked128", argLength: 2, reg: wkw, asm: "VEXPANDPS", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VEXPANDPSMasked256", argLength: 2, reg: wkw, asm: "VEXPANDPS", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VEXPANDPSMasked512", argLength: 2, reg: wkw, asm: "VEXPANDPS", commutative: false, typ: "Vec512", resultInArg0: false},
                {name: "VFMADD213PD128", argLength: 3, reg: w31, asm: "VFMADD213PD", commutative: false, typ: "Vec128", resultInArg0: true},
                {name: "VFMADD213PD256", argLength: 3, reg: w31, asm: "VFMADD213PD", commutative: false, typ: "Vec256", resultInArg0: true},
                {name: "VFMADD213PD512", argLength: 3, reg: w31, asm: "VFMADD213PD", commutative: false, typ: "Vec512", resultInArg0: true},
@@ -357,6 +363,18 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                {name: "VPERMWMasked128", argLength: 3, reg: w2kw, asm: "VPERMW", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VPERMWMasked256", argLength: 3, reg: w2kw, asm: "VPERMW", commutative: false, typ: "Vec256", resultInArg0: false},
                {name: "VPERMWMasked512", argLength: 3, reg: w2kw, asm: "VPERMW", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPEXPANDBMasked128", argLength: 2, reg: wkw, asm: "VPEXPANDB", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPEXPANDBMasked256", argLength: 2, reg: wkw, asm: "VPEXPANDB", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPEXPANDBMasked512", argLength: 2, reg: wkw, asm: "VPEXPANDB", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPEXPANDDMasked128", argLength: 2, reg: wkw, asm: "VPEXPANDD", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPEXPANDDMasked256", argLength: 2, reg: wkw, asm: "VPEXPANDD", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPEXPANDDMasked512", argLength: 2, reg: wkw, asm: "VPEXPANDD", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPEXPANDQMasked128", argLength: 2, reg: wkw, asm: "VPEXPANDQ", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPEXPANDQMasked256", argLength: 2, reg: wkw, asm: "VPEXPANDQ", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPEXPANDQMasked512", argLength: 2, reg: wkw, asm: "VPEXPANDQ", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPEXPANDWMasked128", argLength: 2, reg: wkw, asm: "VPEXPANDW", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPEXPANDWMasked256", argLength: 2, reg: wkw, asm: "VPEXPANDW", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPEXPANDWMasked512", argLength: 2, reg: wkw, asm: "VPEXPANDW", commutative: false, typ: "Vec512", resultInArg0: false},
                {name: "VPHADDD128", argLength: 2, reg: v21, asm: "VPHADDD", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VPHADDD256", argLength: 2, reg: v21, asm: "VPHADDD", commutative: false, typ: "Vec256", resultInArg0: false},
                {name: "VPHADDSW128", argLength: 2, reg: v21, asm: "VPHADDSW", commutative: false, typ: "Vec128", resultInArg0: false},
index e132b058a4a0b69d9a5516f3164c5a0f973bf894..d0a4a494b181c0a7475c7a862184266e356784d3 100644 (file)
@@ -364,6 +364,36 @@ func simdGenericOps() []opData {
                {name: "EqualUint64x2", argLength: 2, commutative: true},
                {name: "EqualUint64x4", argLength: 2, commutative: true},
                {name: "EqualUint64x8", argLength: 2, commutative: true},
+               {name: "ExpandFloat32x4", argLength: 2, commutative: false},
+               {name: "ExpandFloat32x8", argLength: 2, commutative: false},
+               {name: "ExpandFloat32x16", argLength: 2, commutative: false},
+               {name: "ExpandFloat64x2", argLength: 2, commutative: false},
+               {name: "ExpandFloat64x4", argLength: 2, commutative: false},
+               {name: "ExpandFloat64x8", argLength: 2, commutative: false},
+               {name: "ExpandInt8x16", argLength: 2, commutative: false},
+               {name: "ExpandInt8x32", argLength: 2, commutative: false},
+               {name: "ExpandInt8x64", argLength: 2, commutative: false},
+               {name: "ExpandInt16x8", argLength: 2, commutative: false},
+               {name: "ExpandInt16x16", argLength: 2, commutative: false},
+               {name: "ExpandInt16x32", argLength: 2, commutative: false},
+               {name: "ExpandInt32x4", argLength: 2, commutative: false},
+               {name: "ExpandInt32x8", argLength: 2, commutative: false},
+               {name: "ExpandInt32x16", argLength: 2, commutative: false},
+               {name: "ExpandInt64x2", argLength: 2, commutative: false},
+               {name: "ExpandInt64x4", argLength: 2, commutative: false},
+               {name: "ExpandInt64x8", argLength: 2, commutative: false},
+               {name: "ExpandUint8x16", argLength: 2, commutative: false},
+               {name: "ExpandUint8x32", argLength: 2, commutative: false},
+               {name: "ExpandUint8x64", argLength: 2, commutative: false},
+               {name: "ExpandUint16x8", argLength: 2, commutative: false},
+               {name: "ExpandUint16x16", argLength: 2, commutative: false},
+               {name: "ExpandUint16x32", argLength: 2, commutative: false},
+               {name: "ExpandUint32x4", argLength: 2, commutative: false},
+               {name: "ExpandUint32x8", argLength: 2, commutative: false},
+               {name: "ExpandUint32x16", argLength: 2, commutative: false},
+               {name: "ExpandUint64x2", argLength: 2, commutative: false},
+               {name: "ExpandUint64x4", argLength: 2, commutative: false},
+               {name: "ExpandUint64x8", argLength: 2, commutative: false},
                {name: "FloorFloat32x4", argLength: 1, commutative: false},
                {name: "FloorFloat32x8", argLength: 1, commutative: false},
                {name: "FloorFloat64x2", argLength: 1, commutative: false},
index b39311cd90f105eecdca27dbafb94c293d008798..2fafe10ea517b94907f045da0a4f08bedbe562f4 100644 (file)
@@ -1268,6 +1268,12 @@ const (
        OpAMD64VDIVPSMasked128
        OpAMD64VDIVPSMasked256
        OpAMD64VDIVPSMasked512
+       OpAMD64VEXPANDPDMasked128
+       OpAMD64VEXPANDPDMasked256
+       OpAMD64VEXPANDPDMasked512
+       OpAMD64VEXPANDPSMasked128
+       OpAMD64VEXPANDPSMasked256
+       OpAMD64VEXPANDPSMasked512
        OpAMD64VFMADD213PD128
        OpAMD64VFMADD213PD256
        OpAMD64VFMADD213PD512
@@ -1576,6 +1582,18 @@ const (
        OpAMD64VPERMWMasked128
        OpAMD64VPERMWMasked256
        OpAMD64VPERMWMasked512
+       OpAMD64VPEXPANDBMasked128
+       OpAMD64VPEXPANDBMasked256
+       OpAMD64VPEXPANDBMasked512
+       OpAMD64VPEXPANDDMasked128
+       OpAMD64VPEXPANDDMasked256
+       OpAMD64VPEXPANDDMasked512
+       OpAMD64VPEXPANDQMasked128
+       OpAMD64VPEXPANDQMasked256
+       OpAMD64VPEXPANDQMasked512
+       OpAMD64VPEXPANDWMasked128
+       OpAMD64VPEXPANDWMasked256
+       OpAMD64VPEXPANDWMasked512
        OpAMD64VPHADDD128
        OpAMD64VPHADDD256
        OpAMD64VPHADDSW128
@@ -4925,6 +4943,36 @@ const (
        OpEqualUint64x2
        OpEqualUint64x4
        OpEqualUint64x8
+       OpExpandFloat32x4
+       OpExpandFloat32x8
+       OpExpandFloat32x16
+       OpExpandFloat64x2
+       OpExpandFloat64x4
+       OpExpandFloat64x8
+       OpExpandInt8x16
+       OpExpandInt8x32
+       OpExpandInt8x64
+       OpExpandInt16x8
+       OpExpandInt16x16
+       OpExpandInt16x32
+       OpExpandInt32x4
+       OpExpandInt32x8
+       OpExpandInt32x16
+       OpExpandInt64x2
+       OpExpandInt64x4
+       OpExpandInt64x8
+       OpExpandUint8x16
+       OpExpandUint8x32
+       OpExpandUint8x64
+       OpExpandUint16x8
+       OpExpandUint16x16
+       OpExpandUint16x32
+       OpExpandUint32x4
+       OpExpandUint32x8
+       OpExpandUint32x16
+       OpExpandUint64x2
+       OpExpandUint64x4
+       OpExpandUint64x8
        OpFloorFloat32x4
        OpFloorFloat32x8
        OpFloorFloat64x2
@@ -20065,6 +20113,90 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:   "VEXPANDPDMasked128",
+               argLen: 2,
+               asm:    x86.AVEXPANDPD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VEXPANDPDMasked256",
+               argLen: 2,
+               asm:    x86.AVEXPANDPD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VEXPANDPDMasked512",
+               argLen: 2,
+               asm:    x86.AVEXPANDPD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VEXPANDPSMasked128",
+               argLen: 2,
+               asm:    x86.AVEXPANDPS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VEXPANDPSMasked256",
+               argLen: 2,
+               asm:    x86.AVEXPANDPS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VEXPANDPSMasked512",
+               argLen: 2,
+               asm:    x86.AVEXPANDPS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:         "VFMADD213PD128",
                argLen:       3,
@@ -24788,6 +24920,174 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:   "VPEXPANDBMasked128",
+               argLen: 2,
+               asm:    x86.AVPEXPANDB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPEXPANDBMasked256",
+               argLen: 2,
+               asm:    x86.AVPEXPANDB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPEXPANDBMasked512",
+               argLen: 2,
+               asm:    x86.AVPEXPANDB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPEXPANDDMasked128",
+               argLen: 2,
+               asm:    x86.AVPEXPANDD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPEXPANDDMasked256",
+               argLen: 2,
+               asm:    x86.AVPEXPANDD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPEXPANDDMasked512",
+               argLen: 2,
+               asm:    x86.AVPEXPANDD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPEXPANDQMasked128",
+               argLen: 2,
+               asm:    x86.AVPEXPANDQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPEXPANDQMasked256",
+               argLen: 2,
+               asm:    x86.AVPEXPANDQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPEXPANDQMasked512",
+               argLen: 2,
+               asm:    x86.AVPEXPANDQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPEXPANDWMasked128",
+               argLen: 2,
+               asm:    x86.AVPEXPANDW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPEXPANDWMasked256",
+               argLen: 2,
+               asm:    x86.AVPEXPANDW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPEXPANDWMasked512",
+               argLen: 2,
+               asm:    x86.AVPEXPANDW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:   "VPHADDD128",
                argLen: 2,
@@ -64829,6 +65129,156 @@ var opcodeTable = [...]opInfo{
                commutative: true,
                generic:     true,
        },
+       {
+               name:    "ExpandFloat32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandFloat32x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandFloat32x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandFloat64x2",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandFloat64x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandFloat64x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandInt8x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandInt8x32",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandInt8x64",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandInt16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandInt16x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandInt16x32",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandInt32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandInt32x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandInt32x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandInt64x2",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandInt64x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandInt64x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandUint8x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandUint8x32",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandUint8x64",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandUint16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandUint16x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandUint16x32",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandUint32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandUint32x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandUint32x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandUint64x2",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandUint64x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ExpandUint64x8",
+               argLen:  2,
+               generic: true,
+       },
        {
                name:    "FloorFloat32x4",
                argLen:  1,
index 91fd3fb470f080e9b087c00624ae4f490b8e467a..6b63b7024597fc9653a3fc05896fb62e8aa4fec1 100644 (file)
@@ -1754,6 +1754,66 @@ func rewriteValueAMD64(v *Value) bool {
                return true
        case OpEqualUint8x64:
                return rewriteValueAMD64_OpEqualUint8x64(v)
+       case OpExpandFloat32x16:
+               return rewriteValueAMD64_OpExpandFloat32x16(v)
+       case OpExpandFloat32x4:
+               return rewriteValueAMD64_OpExpandFloat32x4(v)
+       case OpExpandFloat32x8:
+               return rewriteValueAMD64_OpExpandFloat32x8(v)
+       case OpExpandFloat64x2:
+               return rewriteValueAMD64_OpExpandFloat64x2(v)
+       case OpExpandFloat64x4:
+               return rewriteValueAMD64_OpExpandFloat64x4(v)
+       case OpExpandFloat64x8:
+               return rewriteValueAMD64_OpExpandFloat64x8(v)
+       case OpExpandInt16x16:
+               return rewriteValueAMD64_OpExpandInt16x16(v)
+       case OpExpandInt16x32:
+               return rewriteValueAMD64_OpExpandInt16x32(v)
+       case OpExpandInt16x8:
+               return rewriteValueAMD64_OpExpandInt16x8(v)
+       case OpExpandInt32x16:
+               return rewriteValueAMD64_OpExpandInt32x16(v)
+       case OpExpandInt32x4:
+               return rewriteValueAMD64_OpExpandInt32x4(v)
+       case OpExpandInt32x8:
+               return rewriteValueAMD64_OpExpandInt32x8(v)
+       case OpExpandInt64x2:
+               return rewriteValueAMD64_OpExpandInt64x2(v)
+       case OpExpandInt64x4:
+               return rewriteValueAMD64_OpExpandInt64x4(v)
+       case OpExpandInt64x8:
+               return rewriteValueAMD64_OpExpandInt64x8(v)
+       case OpExpandInt8x16:
+               return rewriteValueAMD64_OpExpandInt8x16(v)
+       case OpExpandInt8x32:
+               return rewriteValueAMD64_OpExpandInt8x32(v)
+       case OpExpandInt8x64:
+               return rewriteValueAMD64_OpExpandInt8x64(v)
+       case OpExpandUint16x16:
+               return rewriteValueAMD64_OpExpandUint16x16(v)
+       case OpExpandUint16x32:
+               return rewriteValueAMD64_OpExpandUint16x32(v)
+       case OpExpandUint16x8:
+               return rewriteValueAMD64_OpExpandUint16x8(v)
+       case OpExpandUint32x16:
+               return rewriteValueAMD64_OpExpandUint32x16(v)
+       case OpExpandUint32x4:
+               return rewriteValueAMD64_OpExpandUint32x4(v)
+       case OpExpandUint32x8:
+               return rewriteValueAMD64_OpExpandUint32x8(v)
+       case OpExpandUint64x2:
+               return rewriteValueAMD64_OpExpandUint64x2(v)
+       case OpExpandUint64x4:
+               return rewriteValueAMD64_OpExpandUint64x4(v)
+       case OpExpandUint64x8:
+               return rewriteValueAMD64_OpExpandUint64x8(v)
+       case OpExpandUint8x16:
+               return rewriteValueAMD64_OpExpandUint8x16(v)
+       case OpExpandUint8x32:
+               return rewriteValueAMD64_OpExpandUint8x32(v)
+       case OpExpandUint8x64:
+               return rewriteValueAMD64_OpExpandUint8x64(v)
        case OpFMA:
                return rewriteValueAMD64_OpFMA(v)
        case OpFloor:
@@ -34479,6 +34539,486 @@ func rewriteValueAMD64_OpEqualUint8x64(v *Value) bool {
                return true
        }
 }
+func rewriteValueAMD64_OpExpandFloat32x16(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandFloat32x16 x mask)
+       // result: (VEXPANDPSMasked512 x (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VEXPANDPSMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandFloat32x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandFloat32x4 x mask)
+       // result: (VEXPANDPSMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VEXPANDPSMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandFloat32x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandFloat32x8 x mask)
+       // result: (VEXPANDPSMasked256 x (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VEXPANDPSMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandFloat64x2(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandFloat64x2 x mask)
+       // result: (VEXPANDPDMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VEXPANDPDMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandFloat64x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandFloat64x4 x mask)
+       // result: (VEXPANDPDMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VEXPANDPDMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandFloat64x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandFloat64x8 x mask)
+       // result: (VEXPANDPDMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VEXPANDPDMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandInt16x16(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandInt16x16 x mask)
+       // result: (VPEXPANDWMasked256 x (VPMOVVec16x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDWMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandInt16x32(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandInt16x32 x mask)
+       // result: (VPEXPANDWMasked512 x (VPMOVVec16x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDWMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandInt16x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandInt16x8 x mask)
+       // result: (VPEXPANDWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDWMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandInt32x16(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandInt32x16 x mask)
+       // result: (VPEXPANDDMasked512 x (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDDMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandInt32x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandInt32x4 x mask)
+       // result: (VPEXPANDDMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDDMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandInt32x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandInt32x8 x mask)
+       // result: (VPEXPANDDMasked256 x (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDDMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandInt64x2(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandInt64x2 x mask)
+       // result: (VPEXPANDQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDQMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandInt64x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandInt64x4 x mask)
+       // result: (VPEXPANDQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDQMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandInt64x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandInt64x8 x mask)
+       // result: (VPEXPANDQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDQMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandInt8x16(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandInt8x16 x mask)
+       // result: (VPEXPANDBMasked128 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDBMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandInt8x32(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandInt8x32 x mask)
+       // result: (VPEXPANDBMasked256 x (VPMOVVec8x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDBMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandInt8x64(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandInt8x64 x mask)
+       // result: (VPEXPANDBMasked512 x (VPMOVVec8x64ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDBMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandUint16x16(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandUint16x16 x mask)
+       // result: (VPEXPANDWMasked256 x (VPMOVVec16x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDWMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandUint16x32(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandUint16x32 x mask)
+       // result: (VPEXPANDWMasked512 x (VPMOVVec16x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDWMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandUint16x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandUint16x8 x mask)
+       // result: (VPEXPANDWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDWMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandUint32x16(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandUint32x16 x mask)
+       // result: (VPEXPANDDMasked512 x (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDDMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandUint32x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandUint32x4 x mask)
+       // result: (VPEXPANDDMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDDMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandUint32x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandUint32x8 x mask)
+       // result: (VPEXPANDDMasked256 x (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDDMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandUint64x2(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandUint64x2 x mask)
+       // result: (VPEXPANDQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDQMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandUint64x4(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandUint64x4 x mask)
+       // result: (VPEXPANDQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDQMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandUint64x8(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandUint64x8 x mask)
+       // result: (VPEXPANDQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDQMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandUint8x16(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandUint8x16 x mask)
+       // result: (VPEXPANDBMasked128 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDBMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandUint8x32(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandUint8x32 x mask)
+       // result: (VPEXPANDBMasked256 x (VPMOVVec8x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDBMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpExpandUint8x64(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (ExpandUint8x64 x mask)
+       // result: (VPEXPANDBMasked512 x (VPMOVVec8x64ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPEXPANDBMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0)
+               return true
+       }
+}
 func rewriteValueAMD64_OpFMA(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
index 873bb8e2de17c90b309481dc7f1b137dd61e5345..0f65b4500a16863775dc7079988b36f45401bc6c 100644 (file)
@@ -396,6 +396,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Uint64x2.EqualMasked", opLen3(ssa.OpEqualMaskedUint64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint64x4.EqualMasked", opLen3(ssa.OpEqualMaskedUint64x4, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Uint64x8.EqualMasked", opLen3(ssa.OpEqualMaskedUint64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x4.Expand", opLen2(ssa.OpExpandFloat32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float32x8.Expand", opLen2(ssa.OpExpandFloat32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float32x16.Expand", opLen2(ssa.OpExpandFloat32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float64x2.Expand", opLen2(ssa.OpExpandFloat64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float64x4.Expand", opLen2(ssa.OpExpandFloat64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float64x8.Expand", opLen2(ssa.OpExpandFloat64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int8x16.Expand", opLen2(ssa.OpExpandInt8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int8x32.Expand", opLen2(ssa.OpExpandInt8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x64.Expand", opLen2(ssa.OpExpandInt8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int16x8.Expand", opLen2(ssa.OpExpandInt16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x16.Expand", opLen2(ssa.OpExpandInt16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x32.Expand", opLen2(ssa.OpExpandInt16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x4.Expand", opLen2(ssa.OpExpandInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x8.Expand", opLen2(ssa.OpExpandInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x16.Expand", opLen2(ssa.OpExpandInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int64x2.Expand", opLen2(ssa.OpExpandInt64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int64x4.Expand", opLen2(ssa.OpExpandInt64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int64x8.Expand", opLen2(ssa.OpExpandInt64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint8x16.Expand", opLen2(ssa.OpExpandUint8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint8x32.Expand", opLen2(ssa.OpExpandUint8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint8x64.Expand", opLen2(ssa.OpExpandUint8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint16x8.Expand", opLen2(ssa.OpExpandUint16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint16x16.Expand", opLen2(ssa.OpExpandUint16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint16x32.Expand", opLen2(ssa.OpExpandUint16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint32x4.Expand", opLen2(ssa.OpExpandUint32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint32x8.Expand", opLen2(ssa.OpExpandUint32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint32x16.Expand", opLen2(ssa.OpExpandUint32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint64x2.Expand", opLen2(ssa.OpExpandUint64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint64x4.Expand", opLen2(ssa.OpExpandUint64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint64x8.Expand", opLen2(ssa.OpExpandUint64x8, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Float32x4.Floor", opLen1(ssa.OpFloorFloat32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float32x8.Floor", opLen1(ssa.OpFloorFloat32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float64x2.Floor", opLen1(ssa.OpFloorFloat64x2, types.TypeVec128), sys.AMD64)
index 5eb8fea47691a25292109ca490fb7f6550534b50..2138271769db588c37d7a241597b2da83d4d2c3a 100644 (file)
@@ -2399,6 +2399,188 @@ func (x Uint64x4) EqualMasked(y Uint64x4, mask Mask64x4) Mask64x4
 // Asm: VPCMPUQ, CPU Feature: AVX512F
 func (x Uint64x8) EqualMasked(y Uint64x8, mask Mask64x8) Mask64x8
 
+/* Expand */
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VEXPANDPS, CPU Feature: AVX512F
+func (x Float32x4) Expand(mask Mask32x4) Float32x4
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VEXPANDPS, CPU Feature: AVX512F
+func (x Float32x8) Expand(mask Mask32x8) Float32x8
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VEXPANDPS, CPU Feature: AVX512F
+func (x Float32x16) Expand(mask Mask32x16) Float32x16
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VEXPANDPD, CPU Feature: AVX512F
+func (x Float64x2) Expand(mask Mask64x2) Float64x2
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VEXPANDPD, CPU Feature: AVX512F
+func (x Float64x4) Expand(mask Mask64x4) Float64x4
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VEXPANDPD, CPU Feature: AVX512F
+func (x Float64x8) Expand(mask Mask64x8) Float64x8
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
+func (x Int8x16) Expand(mask Mask8x16) Int8x16
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
+func (x Int8x32) Expand(mask Mask8x32) Int8x32
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
+func (x Int8x64) Expand(mask Mask8x64) Int8x64
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
+func (x Int16x8) Expand(mask Mask16x8) Int16x8
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
+func (x Int16x16) Expand(mask Mask16x16) Int16x16
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
+func (x Int16x32) Expand(mask Mask16x32) Int16x32
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDD, CPU Feature: AVX512F
+func (x Int32x4) Expand(mask Mask32x4) Int32x4
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDD, CPU Feature: AVX512F
+func (x Int32x8) Expand(mask Mask32x8) Int32x8
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDD, CPU Feature: AVX512F
+func (x Int32x16) Expand(mask Mask32x16) Int32x16
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDQ, CPU Feature: AVX512F
+func (x Int64x2) Expand(mask Mask64x2) Int64x2
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDQ, CPU Feature: AVX512F
+func (x Int64x4) Expand(mask Mask64x4) Int64x4
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDQ, CPU Feature: AVX512F
+func (x Int64x8) Expand(mask Mask64x8) Int64x8
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
+func (x Uint8x16) Expand(mask Mask8x16) Uint8x16
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
+func (x Uint8x32) Expand(mask Mask8x32) Uint8x32
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
+func (x Uint8x64) Expand(mask Mask8x64) Uint8x64
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
+func (x Uint16x8) Expand(mask Mask16x8) Uint16x8
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
+func (x Uint16x16) Expand(mask Mask16x16) Uint16x16
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
+func (x Uint16x32) Expand(mask Mask16x32) Uint16x32
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDD, CPU Feature: AVX512F
+func (x Uint32x4) Expand(mask Mask32x4) Uint32x4
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDD, CPU Feature: AVX512F
+func (x Uint32x8) Expand(mask Mask32x8) Uint32x8
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDD, CPU Feature: AVX512F
+func (x Uint32x16) Expand(mask Mask32x16) Uint32x16
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDQ, CPU Feature: AVX512F
+func (x Uint64x2) Expand(mask Mask64x2) Uint64x2
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDQ, CPU Feature: AVX512F
+func (x Uint64x4) Expand(mask Mask64x4) Uint64x4
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDQ, CPU Feature: AVX512F
+func (x Uint64x8) Expand(mask Mask64x8) Uint64x8
+
 /* Floor */
 
 // Floor rounds elements down to the nearest integer.
index 571834783887b455c043a76870d066449e55f2d9..9e9b45b5b8e42264338a3bc3d06715a1bbd27d96 100644 (file)
@@ -187,6 +187,22 @@ func TestCompress(t *testing.T) {
        }
 }
 
+func TestExpand(t *testing.T) {
+       if !simd.HasAVX512() {
+               t.Skip("Test requires HasAVX512, not available on this hardware")
+               return
+       }
+       v3400 := simd.LoadInt32x4Slice([]int32{3, 4, 0, 0})
+       v0101 := simd.LoadInt32x4Slice([]int32{0, -1, 0, -1})
+       v2400 := v3400.Expand(v0101.AsMask32x4())
+       got := make([]int32, 4)
+       v2400.StoreSlice(got)
+       want := []int32{0, 3, 0, 4}
+       if !slices.Equal(got, want) {
+               t.Errorf("want and got differ, want=%v, got=%v", want, got)
+       }
+}
+
 func TestPairDotProdAccumulate(t *testing.T) {
        if !simd.HasAVX512GFNI() {
                // TODO: this function is actually VNNI, let's implement and call the right check.