Cypherpunks repositories - gostls13.git/commitdiff
[dev.simd] simd, cmd/compile: generated code for Broadcast
author David Chase <drchase@google.com>
Tue, 5 Aug 2025 21:34:05 +0000 (17:34 -0400)
committer David Chase <drchase@google.com>
Wed, 13 Aug 2025 18:48:29 +0000 (11:48 -0700)
Generated by simdgen CL 693599

This turned out to require some additional work in
other places, including filling in missing
methods (use OverwriteBase to get FP versions).

Also includes a test.

Change-Id: I2efe8967837834745f9cae661d4d4dcbb5390b6f
Reviewed-on: https://go-review.googlesource.com/c/go/+/693758
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
src/cmd/compile/internal/amd64/simdssa.go
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteAMD64.go
src/cmd/compile/internal/ssagen/simdintrinsics.go
src/simd/genfiles.go
src/simd/ops_amd64.go
src/simd/simd_test.go
src/simd/slice_amd64.go

index e6bbdc03def1e44987e4cabb43873d10406c7c55..73a947a88af24a6fc3b93b5ef9a9ccc3a2d1565d 100644 (file)
@@ -24,6 +24,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPABSQ128,
                ssa.OpAMD64VPABSQ256,
                ssa.OpAMD64VPABSQ512,
+               ssa.OpAMD64VBROADCASTSS128,
+               ssa.OpAMD64VPBROADCASTQ128,
+               ssa.OpAMD64VPBROADCASTB128,
+               ssa.OpAMD64VPBROADCASTW128,
+               ssa.OpAMD64VPBROADCASTD128,
+               ssa.OpAMD64VBROADCASTSS256,
+               ssa.OpAMD64VBROADCASTSD256,
+               ssa.OpAMD64VPBROADCASTB256,
+               ssa.OpAMD64VPBROADCASTW256,
+               ssa.OpAMD64VPBROADCASTD256,
+               ssa.OpAMD64VPBROADCASTQ256,
+               ssa.OpAMD64VBROADCASTSS512,
+               ssa.OpAMD64VBROADCASTSD512,
+               ssa.OpAMD64VPBROADCASTB512,
+               ssa.OpAMD64VPBROADCASTW512,
+               ssa.OpAMD64VPBROADCASTD512,
+               ssa.OpAMD64VPBROADCASTQ512,
                ssa.OpAMD64VCVTTPS2DQ128,
                ssa.OpAMD64VCVTTPS2DQ256,
                ssa.OpAMD64VCVTTPS2DQ512,
@@ -624,6 +641,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPABSQMasked128,
                ssa.OpAMD64VPABSQMasked256,
                ssa.OpAMD64VPABSQMasked512,
+               ssa.OpAMD64VBROADCASTSSMasked128,
+               ssa.OpAMD64VPBROADCASTQMasked128,
+               ssa.OpAMD64VPBROADCASTBMasked128,
+               ssa.OpAMD64VPBROADCASTWMasked128,
+               ssa.OpAMD64VPBROADCASTDMasked128,
+               ssa.OpAMD64VBROADCASTSSMasked256,
+               ssa.OpAMD64VBROADCASTSDMasked256,
+               ssa.OpAMD64VPBROADCASTBMasked256,
+               ssa.OpAMD64VPBROADCASTWMasked256,
+               ssa.OpAMD64VPBROADCASTDMasked256,
+               ssa.OpAMD64VPBROADCASTQMasked256,
+               ssa.OpAMD64VBROADCASTSSMasked512,
+               ssa.OpAMD64VBROADCASTSDMasked512,
+               ssa.OpAMD64VPBROADCASTBMasked512,
+               ssa.OpAMD64VPBROADCASTWMasked512,
+               ssa.OpAMD64VPBROADCASTDMasked512,
+               ssa.OpAMD64VPBROADCASTQMasked512,
                ssa.OpAMD64VCOMPRESSPSMasked128,
                ssa.OpAMD64VCOMPRESSPSMasked256,
                ssa.OpAMD64VCOMPRESSPSMasked512,
@@ -1104,10 +1138,10 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPSRLQMasked512:
                p = simdVfpkv(s, v)
 
-       case ssa.OpAMD64VPINSRB128,
-               ssa.OpAMD64VPINSRW128,
-               ssa.OpAMD64VPINSRD128,
-               ssa.OpAMD64VPINSRQ128:
+       case ssa.OpAMD64VPINSRD128,
+               ssa.OpAMD64VPINSRQ128,
+               ssa.OpAMD64VPINSRB128,
+               ssa.OpAMD64VPINSRW128:
                p = simdVgpvImm8(s, v)
 
        case ssa.OpAMD64VPEXTRB128,
@@ -1221,6 +1255,23 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPAVGWMasked128,
                ssa.OpAMD64VPAVGWMasked256,
                ssa.OpAMD64VPAVGWMasked512,
+               ssa.OpAMD64VBROADCASTSSMasked128,
+               ssa.OpAMD64VPBROADCASTQMasked128,
+               ssa.OpAMD64VPBROADCASTBMasked128,
+               ssa.OpAMD64VPBROADCASTWMasked128,
+               ssa.OpAMD64VPBROADCASTDMasked128,
+               ssa.OpAMD64VBROADCASTSSMasked256,
+               ssa.OpAMD64VBROADCASTSDMasked256,
+               ssa.OpAMD64VPBROADCASTBMasked256,
+               ssa.OpAMD64VPBROADCASTWMasked256,
+               ssa.OpAMD64VPBROADCASTDMasked256,
+               ssa.OpAMD64VPBROADCASTQMasked256,
+               ssa.OpAMD64VBROADCASTSSMasked512,
+               ssa.OpAMD64VBROADCASTSDMasked512,
+               ssa.OpAMD64VPBROADCASTBMasked512,
+               ssa.OpAMD64VPBROADCASTWMasked512,
+               ssa.OpAMD64VPBROADCASTDMasked512,
+               ssa.OpAMD64VPBROADCASTQMasked512,
                ssa.OpAMD64VRNDSCALEPSMasked128,
                ssa.OpAMD64VRNDSCALEPSMasked256,
                ssa.OpAMD64VRNDSCALEPSMasked512,
index 80cddaae79e2a141448be646838053f66fcf38ea..e7c5a1a97d372d7ba20a5ef542d98b3779013cd0 100644 (file)
 (AverageMaskedUint16x8 x y mask) => (VPAVGWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
 (AverageMaskedUint16x16 x y mask) => (VPAVGWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
 (AverageMaskedUint16x32 x y mask) => (VPAVGWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+(Broadcast128Float32x4 ...) => (VBROADCASTSS128 ...)
+(Broadcast128Float64x2 ...) => (VPBROADCASTQ128 ...)
+(Broadcast128Int8x16 ...) => (VPBROADCASTB128 ...)
+(Broadcast128Int16x8 ...) => (VPBROADCASTW128 ...)
+(Broadcast128Int32x4 ...) => (VPBROADCASTD128 ...)
+(Broadcast128Int64x2 ...) => (VPBROADCASTQ128 ...)
+(Broadcast128Uint8x16 ...) => (VPBROADCASTB128 ...)
+(Broadcast128Uint16x8 ...) => (VPBROADCASTW128 ...)
+(Broadcast128Uint32x4 ...) => (VPBROADCASTD128 ...)
+(Broadcast128Uint64x2 ...) => (VPBROADCASTQ128 ...)
+(Broadcast128MaskedFloat32x4 x mask) => (VBROADCASTSSMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(Broadcast128MaskedFloat64x2 x mask) => (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(Broadcast128MaskedInt8x16 x mask) => (VPBROADCASTBMasked128 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(Broadcast128MaskedInt16x8 x mask) => (VPBROADCASTWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(Broadcast128MaskedInt32x4 x mask) => (VPBROADCASTDMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(Broadcast128MaskedInt64x2 x mask) => (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(Broadcast128MaskedUint8x16 x mask) => (VPBROADCASTBMasked128 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(Broadcast128MaskedUint16x8 x mask) => (VPBROADCASTWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(Broadcast128MaskedUint32x4 x mask) => (VPBROADCASTDMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(Broadcast128MaskedUint64x2 x mask) => (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(Broadcast256Float32x4 ...) => (VBROADCASTSS256 ...)
+(Broadcast256Float64x2 ...) => (VBROADCASTSD256 ...)
+(Broadcast256Int8x16 ...) => (VPBROADCASTB256 ...)
+(Broadcast256Int16x8 ...) => (VPBROADCASTW256 ...)
+(Broadcast256Int32x4 ...) => (VPBROADCASTD256 ...)
+(Broadcast256Int64x2 ...) => (VPBROADCASTQ256 ...)
+(Broadcast256Uint8x16 ...) => (VPBROADCASTB256 ...)
+(Broadcast256Uint16x8 ...) => (VPBROADCASTW256 ...)
+(Broadcast256Uint32x4 ...) => (VPBROADCASTD256 ...)
+(Broadcast256Uint64x2 ...) => (VPBROADCASTQ256 ...)
+(Broadcast256MaskedFloat32x4 x mask) => (VBROADCASTSSMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(Broadcast256MaskedFloat64x2 x mask) => (VBROADCASTSDMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(Broadcast256MaskedInt8x16 x mask) => (VPBROADCASTBMasked256 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(Broadcast256MaskedInt16x8 x mask) => (VPBROADCASTWMasked256 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(Broadcast256MaskedInt32x4 x mask) => (VPBROADCASTDMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(Broadcast256MaskedInt64x2 x mask) => (VPBROADCASTQMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(Broadcast256MaskedUint8x16 x mask) => (VPBROADCASTBMasked256 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(Broadcast256MaskedUint16x8 x mask) => (VPBROADCASTWMasked256 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(Broadcast256MaskedUint32x4 x mask) => (VPBROADCASTDMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(Broadcast256MaskedUint64x2 x mask) => (VPBROADCASTQMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(Broadcast512Float32x4 ...) => (VBROADCASTSS512 ...)
+(Broadcast512Float64x2 ...) => (VBROADCASTSD512 ...)
+(Broadcast512Int8x16 ...) => (VPBROADCASTB512 ...)
+(Broadcast512Int16x8 ...) => (VPBROADCASTW512 ...)
+(Broadcast512Int32x4 ...) => (VPBROADCASTD512 ...)
+(Broadcast512Int64x2 ...) => (VPBROADCASTQ512 ...)
+(Broadcast512Uint8x16 ...) => (VPBROADCASTB512 ...)
+(Broadcast512Uint16x8 ...) => (VPBROADCASTW512 ...)
+(Broadcast512Uint32x4 ...) => (VPBROADCASTD512 ...)
+(Broadcast512Uint64x2 ...) => (VPBROADCASTQ512 ...)
+(Broadcast512MaskedFloat32x4 x mask) => (VBROADCASTSSMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(Broadcast512MaskedFloat64x2 x mask) => (VBROADCASTSDMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(Broadcast512MaskedInt8x16 x mask) => (VPBROADCASTBMasked512 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(Broadcast512MaskedInt16x8 x mask) => (VPBROADCASTWMasked512 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(Broadcast512MaskedInt32x4 x mask) => (VPBROADCASTDMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(Broadcast512MaskedInt64x2 x mask) => (VPBROADCASTQMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(Broadcast512MaskedUint8x16 x mask) => (VPBROADCASTBMasked512 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+(Broadcast512MaskedUint16x8 x mask) => (VPBROADCASTWMasked512 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(Broadcast512MaskedUint32x4 x mask) => (VPBROADCASTDMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(Broadcast512MaskedUint64x2 x mask) => (VPBROADCASTQMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
 (CeilFloat32x4 x) => (VROUNDPS128 [2] x)
 (CeilFloat32x8 x) => (VROUNDPS256 [2] x)
 (CeilFloat64x2 x) => (VROUNDPD128 [2] x)
 (ScaleMaskedFloat64x2 x y mask) => (VSCALEFPDMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
 (ScaleMaskedFloat64x4 x y mask) => (VSCALEFPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
 (ScaleMaskedFloat64x8 x y mask) => (VSCALEFPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(SetElemFloat32x4 ...) => (VPINSRD128 ...)
+(SetElemFloat64x2 ...) => (VPINSRQ128 ...)
 (SetElemInt8x16 ...) => (VPINSRB128 ...)
 (SetElemInt16x8 ...) => (VPINSRW128 ...)
 (SetElemInt32x4 ...) => (VPINSRD128 ...)
index afea4c0a46e4dbffd57b1d66cac58c6503d40321..5d388a4531ba4ba4b0657e8485a5a483d5cbc8e9 100644 (file)
@@ -20,6 +20,16 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                {name: "VADDSUBPD256", argLength: 2, reg: v21, asm: "VADDSUBPD", commutative: false, typ: "Vec256", resultInArg0: false},
                {name: "VADDSUBPS128", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VADDSUBPS256", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VBROADCASTSD256", argLength: 1, reg: v11, asm: "VBROADCASTSD", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VBROADCASTSD512", argLength: 1, reg: w11, asm: "VBROADCASTSD", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VBROADCASTSDMasked256", argLength: 2, reg: wkw, asm: "VBROADCASTSD", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VBROADCASTSDMasked512", argLength: 2, reg: wkw, asm: "VBROADCASTSD", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VBROADCASTSS128", argLength: 1, reg: v11, asm: "VBROADCASTSS", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VBROADCASTSS256", argLength: 1, reg: v11, asm: "VBROADCASTSS", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VBROADCASTSS512", argLength: 1, reg: w11, asm: "VBROADCASTSS", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VBROADCASTSSMasked128", argLength: 2, reg: wkw, asm: "VBROADCASTSS", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VBROADCASTSSMasked256", argLength: 2, reg: wkw, asm: "VBROADCASTSS", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VBROADCASTSSMasked512", argLength: 2, reg: wkw, asm: "VBROADCASTSS", commutative: false, typ: "Vec512", resultInArg0: false},
                {name: "VCOMPRESSPDMasked128", argLength: 2, reg: wkw, asm: "VCOMPRESSPD", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VCOMPRESSPDMasked256", argLength: 2, reg: wkw, asm: "VCOMPRESSPD", commutative: false, typ: "Vec256", resultInArg0: false},
                {name: "VCOMPRESSPDMasked512", argLength: 2, reg: wkw, asm: "VCOMPRESSPD", commutative: false, typ: "Vec512", resultInArg0: false},
@@ -252,6 +262,30 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                {name: "VPBLENDMWMasked512", argLength: 3, reg: w2kw, asm: "VPBLENDMW", commutative: false, typ: "Vec512", resultInArg0: false},
                {name: "VPBLENDVB128", argLength: 3, reg: v31, asm: "VPBLENDVB", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VPBLENDVB256", argLength: 3, reg: v31, asm: "VPBLENDVB", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPBROADCASTB128", argLength: 1, reg: v11, asm: "VPBROADCASTB", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPBROADCASTB256", argLength: 1, reg: v11, asm: "VPBROADCASTB", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPBROADCASTB512", argLength: 1, reg: w11, asm: "VPBROADCASTB", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPBROADCASTBMasked128", argLength: 2, reg: wkw, asm: "VPBROADCASTB", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPBROADCASTBMasked256", argLength: 2, reg: wkw, asm: "VPBROADCASTB", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPBROADCASTBMasked512", argLength: 2, reg: wkw, asm: "VPBROADCASTB", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPBROADCASTD128", argLength: 1, reg: v11, asm: "VPBROADCASTD", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPBROADCASTD256", argLength: 1, reg: v11, asm: "VPBROADCASTD", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPBROADCASTD512", argLength: 1, reg: w11, asm: "VPBROADCASTD", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPBROADCASTDMasked128", argLength: 2, reg: wkw, asm: "VPBROADCASTD", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPBROADCASTDMasked256", argLength: 2, reg: wkw, asm: "VPBROADCASTD", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPBROADCASTDMasked512", argLength: 2, reg: wkw, asm: "VPBROADCASTD", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPBROADCASTQ128", argLength: 1, reg: v11, asm: "VPBROADCASTQ", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPBROADCASTQ256", argLength: 1, reg: v11, asm: "VPBROADCASTQ", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPBROADCASTQ512", argLength: 1, reg: w11, asm: "VPBROADCASTQ", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPBROADCASTQMasked128", argLength: 2, reg: wkw, asm: "VPBROADCASTQ", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPBROADCASTQMasked256", argLength: 2, reg: wkw, asm: "VPBROADCASTQ", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPBROADCASTQMasked512", argLength: 2, reg: wkw, asm: "VPBROADCASTQ", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPBROADCASTW128", argLength: 1, reg: v11, asm: "VPBROADCASTW", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPBROADCASTW256", argLength: 1, reg: v11, asm: "VPBROADCASTW", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPBROADCASTW512", argLength: 1, reg: w11, asm: "VPBROADCASTW", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPBROADCASTWMasked128", argLength: 2, reg: wkw, asm: "VPBROADCASTW", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPBROADCASTWMasked256", argLength: 2, reg: wkw, asm: "VPBROADCASTW", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPBROADCASTWMasked512", argLength: 2, reg: wkw, asm: "VPBROADCASTW", commutative: false, typ: "Vec512", resultInArg0: false},
                {name: "VPCMPEQB128", argLength: 2, reg: v21, asm: "VPCMPEQB", commutative: true, typ: "Vec128", resultInArg0: false},
                {name: "VPCMPEQB256", argLength: 2, reg: v21, asm: "VPCMPEQB", commutative: true, typ: "Vec256", resultInArg0: false},
                {name: "VPCMPEQB512", argLength: 2, reg: w2k, asm: "VPCMPEQB", commutative: true, typ: "Mask", resultInArg0: false},
@@ -1000,10 +1034,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                {name: "VPRORQMasked128", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VPRORQMasked256", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
                {name: "VPRORQMasked512", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
-               {name: "VPINSRB128", argLength: 2, reg: vgpv, asm: "VPINSRB", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
-               {name: "VPINSRW128", argLength: 2, reg: vgpv, asm: "VPINSRW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VPINSRD128", argLength: 2, reg: vgpv, asm: "VPINSRD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VPINSRQ128", argLength: 2, reg: vgpv, asm: "VPINSRQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPINSRB128", argLength: 2, reg: vgpv, asm: "VPINSRB", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPINSRW128", argLength: 2, reg: vgpv, asm: "VPINSRW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VINSERTF128256", argLength: 2, reg: v21, asm: "VINSERTF128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
                {name: "VINSERTF64X4512", argLength: 2, reg: w21, asm: "VINSERTF64X4", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
                {name: "VINSERTI128256", argLength: 2, reg: v21, asm: "VINSERTI128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
index fea701e174f0bb57f1db4ceb137721f16026ef91..f120dcddd0c0c9e935ca04786528f014ed40b730 100644 (file)
@@ -232,6 +232,66 @@ func simdGenericOps() []opData {
                {name: "AverageUint16x8", argLength: 2, commutative: true},
                {name: "AverageUint16x16", argLength: 2, commutative: true},
                {name: "AverageUint16x32", argLength: 2, commutative: true},
+               {name: "Broadcast128Float32x4", argLength: 1, commutative: false},
+               {name: "Broadcast128Float64x2", argLength: 1, commutative: false},
+               {name: "Broadcast128Int8x16", argLength: 1, commutative: false},
+               {name: "Broadcast128Int16x8", argLength: 1, commutative: false},
+               {name: "Broadcast128Int32x4", argLength: 1, commutative: false},
+               {name: "Broadcast128Int64x2", argLength: 1, commutative: false},
+               {name: "Broadcast128MaskedFloat32x4", argLength: 2, commutative: false},
+               {name: "Broadcast128MaskedFloat64x2", argLength: 2, commutative: false},
+               {name: "Broadcast128MaskedInt8x16", argLength: 2, commutative: false},
+               {name: "Broadcast128MaskedInt16x8", argLength: 2, commutative: false},
+               {name: "Broadcast128MaskedInt32x4", argLength: 2, commutative: false},
+               {name: "Broadcast128MaskedInt64x2", argLength: 2, commutative: false},
+               {name: "Broadcast128MaskedUint8x16", argLength: 2, commutative: false},
+               {name: "Broadcast128MaskedUint16x8", argLength: 2, commutative: false},
+               {name: "Broadcast128MaskedUint32x4", argLength: 2, commutative: false},
+               {name: "Broadcast128MaskedUint64x2", argLength: 2, commutative: false},
+               {name: "Broadcast128Uint8x16", argLength: 1, commutative: false},
+               {name: "Broadcast128Uint16x8", argLength: 1, commutative: false},
+               {name: "Broadcast128Uint32x4", argLength: 1, commutative: false},
+               {name: "Broadcast128Uint64x2", argLength: 1, commutative: false},
+               {name: "Broadcast256Float32x4", argLength: 1, commutative: false},
+               {name: "Broadcast256Float64x2", argLength: 1, commutative: false},
+               {name: "Broadcast256Int8x16", argLength: 1, commutative: false},
+               {name: "Broadcast256Int16x8", argLength: 1, commutative: false},
+               {name: "Broadcast256Int32x4", argLength: 1, commutative: false},
+               {name: "Broadcast256Int64x2", argLength: 1, commutative: false},
+               {name: "Broadcast256MaskedFloat32x4", argLength: 2, commutative: false},
+               {name: "Broadcast256MaskedFloat64x2", argLength: 2, commutative: false},
+               {name: "Broadcast256MaskedInt8x16", argLength: 2, commutative: false},
+               {name: "Broadcast256MaskedInt16x8", argLength: 2, commutative: false},
+               {name: "Broadcast256MaskedInt32x4", argLength: 2, commutative: false},
+               {name: "Broadcast256MaskedInt64x2", argLength: 2, commutative: false},
+               {name: "Broadcast256MaskedUint8x16", argLength: 2, commutative: false},
+               {name: "Broadcast256MaskedUint16x8", argLength: 2, commutative: false},
+               {name: "Broadcast256MaskedUint32x4", argLength: 2, commutative: false},
+               {name: "Broadcast256MaskedUint64x2", argLength: 2, commutative: false},
+               {name: "Broadcast256Uint8x16", argLength: 1, commutative: false},
+               {name: "Broadcast256Uint16x8", argLength: 1, commutative: false},
+               {name: "Broadcast256Uint32x4", argLength: 1, commutative: false},
+               {name: "Broadcast256Uint64x2", argLength: 1, commutative: false},
+               {name: "Broadcast512Float32x4", argLength: 1, commutative: false},
+               {name: "Broadcast512Float64x2", argLength: 1, commutative: false},
+               {name: "Broadcast512Int8x16", argLength: 1, commutative: false},
+               {name: "Broadcast512Int16x8", argLength: 1, commutative: false},
+               {name: "Broadcast512Int32x4", argLength: 1, commutative: false},
+               {name: "Broadcast512Int64x2", argLength: 1, commutative: false},
+               {name: "Broadcast512MaskedFloat32x4", argLength: 2, commutative: false},
+               {name: "Broadcast512MaskedFloat64x2", argLength: 2, commutative: false},
+               {name: "Broadcast512MaskedInt8x16", argLength: 2, commutative: false},
+               {name: "Broadcast512MaskedInt16x8", argLength: 2, commutative: false},
+               {name: "Broadcast512MaskedInt32x4", argLength: 2, commutative: false},
+               {name: "Broadcast512MaskedInt64x2", argLength: 2, commutative: false},
+               {name: "Broadcast512MaskedUint8x16", argLength: 2, commutative: false},
+               {name: "Broadcast512MaskedUint16x8", argLength: 2, commutative: false},
+               {name: "Broadcast512MaskedUint32x4", argLength: 2, commutative: false},
+               {name: "Broadcast512MaskedUint64x2", argLength: 2, commutative: false},
+               {name: "Broadcast512Uint8x16", argLength: 1, commutative: false},
+               {name: "Broadcast512Uint16x8", argLength: 1, commutative: false},
+               {name: "Broadcast512Uint32x4", argLength: 1, commutative: false},
+               {name: "Broadcast512Uint64x2", argLength: 1, commutative: false},
                {name: "CeilFloat32x4", argLength: 1, commutative: false},
                {name: "CeilFloat32x8", argLength: 1, commutative: false},
                {name: "CeilFloat64x2", argLength: 1, commutative: false},
@@ -1812,6 +1872,8 @@ func simdGenericOps() []opData {
                {name: "RoundToEvenScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "UInt8"},
                {name: "RoundToEvenScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "UInt8"},
                {name: "RoundToEvenScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "UInt8"},
+               {name: "SetElemFloat32x4", argLength: 2, commutative: false, aux: "UInt8"},
+               {name: "SetElemFloat64x2", argLength: 2, commutative: false, aux: "UInt8"},
                {name: "SetElemInt8x16", argLength: 2, commutative: false, aux: "UInt8"},
                {name: "SetElemInt16x8", argLength: 2, commutative: false, aux: "UInt8"},
                {name: "SetElemInt32x4", argLength: 2, commutative: false, aux: "UInt8"},
index 77527c83b8c7511c69fa4959a9cfae1af641e7e2..6e0ffd154085873840e5d37912c255451c4295e8 100644 (file)
@@ -1242,6 +1242,16 @@ const (
        OpAMD64VADDSUBPD256
        OpAMD64VADDSUBPS128
        OpAMD64VADDSUBPS256
+       OpAMD64VBROADCASTSD256
+       OpAMD64VBROADCASTSD512
+       OpAMD64VBROADCASTSDMasked256
+       OpAMD64VBROADCASTSDMasked512
+       OpAMD64VBROADCASTSS128
+       OpAMD64VBROADCASTSS256
+       OpAMD64VBROADCASTSS512
+       OpAMD64VBROADCASTSSMasked128
+       OpAMD64VBROADCASTSSMasked256
+       OpAMD64VBROADCASTSSMasked512
        OpAMD64VCOMPRESSPDMasked128
        OpAMD64VCOMPRESSPDMasked256
        OpAMD64VCOMPRESSPDMasked512
@@ -1474,6 +1484,30 @@ const (
        OpAMD64VPBLENDMWMasked512
        OpAMD64VPBLENDVB128
        OpAMD64VPBLENDVB256
+       OpAMD64VPBROADCASTB128
+       OpAMD64VPBROADCASTB256
+       OpAMD64VPBROADCASTB512
+       OpAMD64VPBROADCASTBMasked128
+       OpAMD64VPBROADCASTBMasked256
+       OpAMD64VPBROADCASTBMasked512
+       OpAMD64VPBROADCASTD128
+       OpAMD64VPBROADCASTD256
+       OpAMD64VPBROADCASTD512
+       OpAMD64VPBROADCASTDMasked128
+       OpAMD64VPBROADCASTDMasked256
+       OpAMD64VPBROADCASTDMasked512
+       OpAMD64VPBROADCASTQ128
+       OpAMD64VPBROADCASTQ256
+       OpAMD64VPBROADCASTQ512
+       OpAMD64VPBROADCASTQMasked128
+       OpAMD64VPBROADCASTQMasked256
+       OpAMD64VPBROADCASTQMasked512
+       OpAMD64VPBROADCASTW128
+       OpAMD64VPBROADCASTW256
+       OpAMD64VPBROADCASTW512
+       OpAMD64VPBROADCASTWMasked128
+       OpAMD64VPBROADCASTWMasked256
+       OpAMD64VPBROADCASTWMasked512
        OpAMD64VPCMPEQB128
        OpAMD64VPCMPEQB256
        OpAMD64VPCMPEQB512
@@ -2222,10 +2256,10 @@ const (
        OpAMD64VPRORQMasked128
        OpAMD64VPRORQMasked256
        OpAMD64VPRORQMasked512
-       OpAMD64VPINSRB128
-       OpAMD64VPINSRW128
        OpAMD64VPINSRD128
        OpAMD64VPINSRQ128
+       OpAMD64VPINSRB128
+       OpAMD64VPINSRW128
        OpAMD64VINSERTF128256
        OpAMD64VINSERTF64X4512
        OpAMD64VINSERTI128256
@@ -4839,6 +4873,66 @@ const (
        OpAverageUint16x8
        OpAverageUint16x16
        OpAverageUint16x32
+       OpBroadcast128Float32x4
+       OpBroadcast128Float64x2
+       OpBroadcast128Int8x16
+       OpBroadcast128Int16x8
+       OpBroadcast128Int32x4
+       OpBroadcast128Int64x2
+       OpBroadcast128MaskedFloat32x4
+       OpBroadcast128MaskedFloat64x2
+       OpBroadcast128MaskedInt8x16
+       OpBroadcast128MaskedInt16x8
+       OpBroadcast128MaskedInt32x4
+       OpBroadcast128MaskedInt64x2
+       OpBroadcast128MaskedUint8x16
+       OpBroadcast128MaskedUint16x8
+       OpBroadcast128MaskedUint32x4
+       OpBroadcast128MaskedUint64x2
+       OpBroadcast128Uint8x16
+       OpBroadcast128Uint16x8
+       OpBroadcast128Uint32x4
+       OpBroadcast128Uint64x2
+       OpBroadcast256Float32x4
+       OpBroadcast256Float64x2
+       OpBroadcast256Int8x16
+       OpBroadcast256Int16x8
+       OpBroadcast256Int32x4
+       OpBroadcast256Int64x2
+       OpBroadcast256MaskedFloat32x4
+       OpBroadcast256MaskedFloat64x2
+       OpBroadcast256MaskedInt8x16
+       OpBroadcast256MaskedInt16x8
+       OpBroadcast256MaskedInt32x4
+       OpBroadcast256MaskedInt64x2
+       OpBroadcast256MaskedUint8x16
+       OpBroadcast256MaskedUint16x8
+       OpBroadcast256MaskedUint32x4
+       OpBroadcast256MaskedUint64x2
+       OpBroadcast256Uint8x16
+       OpBroadcast256Uint16x8
+       OpBroadcast256Uint32x4
+       OpBroadcast256Uint64x2
+       OpBroadcast512Float32x4
+       OpBroadcast512Float64x2
+       OpBroadcast512Int8x16
+       OpBroadcast512Int16x8
+       OpBroadcast512Int32x4
+       OpBroadcast512Int64x2
+       OpBroadcast512MaskedFloat32x4
+       OpBroadcast512MaskedFloat64x2
+       OpBroadcast512MaskedInt8x16
+       OpBroadcast512MaskedInt16x8
+       OpBroadcast512MaskedInt32x4
+       OpBroadcast512MaskedInt64x2
+       OpBroadcast512MaskedUint8x16
+       OpBroadcast512MaskedUint16x8
+       OpBroadcast512MaskedUint32x4
+       OpBroadcast512MaskedUint64x2
+       OpBroadcast512Uint8x16
+       OpBroadcast512Uint16x8
+       OpBroadcast512Uint32x4
+       OpBroadcast512Uint64x2
        OpCeilFloat32x4
        OpCeilFloat32x8
        OpCeilFloat64x2
@@ -6419,6 +6513,8 @@ const (
        OpRoundToEvenScaledResidueMaskedFloat64x2
        OpRoundToEvenScaledResidueMaskedFloat64x4
        OpRoundToEvenScaledResidueMaskedFloat64x8
+       OpSetElemFloat32x4
+       OpSetElemFloat64x2
        OpSetElemInt8x16
        OpSetElemInt16x8
        OpSetElemInt32x4
@@ -19771,6 +19867,141 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:   "VBROADCASTSD256",
+               argLen: 1,
+               asm:    x86.AVBROADCASTSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VBROADCASTSD512",
+               argLen: 1,
+               asm:    x86.AVBROADCASTSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VBROADCASTSDMasked256",
+               argLen: 2,
+               asm:    x86.AVBROADCASTSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VBROADCASTSDMasked512",
+               argLen: 2,
+               asm:    x86.AVBROADCASTSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VBROADCASTSS128",
+               argLen: 1,
+               asm:    x86.AVBROADCASTSS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VBROADCASTSS256",
+               argLen: 1,
+               asm:    x86.AVBROADCASTSS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VBROADCASTSS512",
+               argLen: 1,
+               asm:    x86.AVBROADCASTSS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VBROADCASTSSMasked128",
+               argLen: 2,
+               asm:    x86.AVBROADCASTSS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VBROADCASTSSMasked256",
+               argLen: 2,
+               asm:    x86.AVBROADCASTSS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VBROADCASTSSMasked512",
+               argLen: 2,
+               asm:    x86.AVBROADCASTSS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:   "VCOMPRESSPDMasked128",
                argLen: 2,
@@ -23272,6 +23503,330 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:   "VPBROADCASTB128",
+               argLen: 1,
+               asm:    x86.AVPBROADCASTB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTB256",
+               argLen: 1,
+               asm:    x86.AVPBROADCASTB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTB512",
+               argLen: 1,
+               asm:    x86.AVPBROADCASTB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTBMasked128",
+               argLen: 2,
+               asm:    x86.AVPBROADCASTB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTBMasked256",
+               argLen: 2,
+               asm:    x86.AVPBROADCASTB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTBMasked512",
+               argLen: 2,
+               asm:    x86.AVPBROADCASTB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTD128",
+               argLen: 1,
+               asm:    x86.AVPBROADCASTD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTD256",
+               argLen: 1,
+               asm:    x86.AVPBROADCASTD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTD512",
+               argLen: 1,
+               asm:    x86.AVPBROADCASTD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTDMasked128",
+               argLen: 2,
+               asm:    x86.AVPBROADCASTD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTDMasked256",
+               argLen: 2,
+               asm:    x86.AVPBROADCASTD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTDMasked512",
+               argLen: 2,
+               asm:    x86.AVPBROADCASTD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTQ128",
+               argLen: 1,
+               asm:    x86.AVPBROADCASTQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTQ256",
+               argLen: 1,
+               asm:    x86.AVPBROADCASTQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTQ512",
+               argLen: 1,
+               asm:    x86.AVPBROADCASTQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTQMasked128",
+               argLen: 2,
+               asm:    x86.AVPBROADCASTQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTQMasked256",
+               argLen: 2,
+               asm:    x86.AVPBROADCASTQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTQMasked512",
+               argLen: 2,
+               asm:    x86.AVPBROADCASTQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTW128",
+               argLen: 1,
+               asm:    x86.AVPBROADCASTW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTW256",
+               argLen: 1,
+               asm:    x86.AVPBROADCASTW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTW512",
+               argLen: 1,
+               asm:    x86.AVPBROADCASTW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTWMasked128",
+               argLen: 2,
+               asm:    x86.AVPBROADCASTW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTWMasked256",
+               argLen: 2,
+               asm:    x86.AVPBROADCASTW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPBROADCASTWMasked512",
+               argLen: 2,
+               asm:    x86.AVPBROADCASTW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:        "VPCMPEQB128",
                argLen:      2,
@@ -34482,10 +35037,10 @@ var opcodeTable = [...]opInfo{
                },
        },
        {
-               name:    "VPINSRB128",
+               name:    "VPINSRD128",
                auxType: auxUInt8,
                argLen:  2,
-               asm:     x86.AVPINSRB,
+               asm:     x86.AVPINSRD,
                reg: regInfo{
                        inputs: []inputInfo{
                                {1, 49135},      // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
@@ -34497,10 +35052,10 @@ var opcodeTable = [...]opInfo{
                },
        },
        {
-               name:    "VPINSRW128",
+               name:    "VPINSRQ128",
                auxType: auxUInt8,
                argLen:  2,
-               asm:     x86.AVPINSRW,
+               asm:     x86.AVPINSRQ,
                reg: regInfo{
                        inputs: []inputInfo{
                                {1, 49135},      // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
@@ -34512,10 +35067,10 @@ var opcodeTable = [...]opInfo{
                },
        },
        {
-               name:    "VPINSRD128",
+               name:    "VPINSRB128",
                auxType: auxUInt8,
                argLen:  2,
-               asm:     x86.AVPINSRD,
+               asm:     x86.AVPINSRB,
                reg: regInfo{
                        inputs: []inputInfo{
                                {1, 49135},      // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
@@ -34527,10 +35082,10 @@ var opcodeTable = [...]opInfo{
                },
        },
        {
-               name:    "VPINSRQ128",
+               name:    "VPINSRW128",
                auxType: auxUInt8,
                argLen:  2,
-               asm:     x86.AVPINSRQ,
+               asm:     x86.AVPINSRW,
                reg: regInfo{
                        inputs: []inputInfo{
                                {1, 49135},      // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
@@ -64725,6 +65280,306 @@ var opcodeTable = [...]opInfo{
                commutative: true,
                generic:     true,
        },
+       {
+               name:    "Broadcast128Float32x4",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128Float64x2",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128Int8x16",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128Int16x8",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128Int32x4",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128Int64x2",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128MaskedFloat32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128MaskedFloat64x2",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128MaskedInt8x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128MaskedInt16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128MaskedInt32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128MaskedInt64x2",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128MaskedUint8x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128MaskedUint16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128MaskedUint32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128MaskedUint64x2",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128Uint8x16",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128Uint16x8",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128Uint32x4",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast128Uint64x2",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256Float32x4",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256Float64x2",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256Int8x16",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256Int16x8",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256Int32x4",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256Int64x2",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256MaskedFloat32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256MaskedFloat64x2",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256MaskedInt8x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256MaskedInt16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256MaskedInt32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256MaskedInt64x2",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256MaskedUint8x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256MaskedUint16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256MaskedUint32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256MaskedUint64x2",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256Uint8x16",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256Uint16x8",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256Uint32x4",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast256Uint64x2",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512Float32x4",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512Float64x2",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512Int8x16",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512Int16x8",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512Int32x4",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512Int64x2",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512MaskedFloat32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512MaskedFloat64x2",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512MaskedInt8x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512MaskedInt16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512MaskedInt32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512MaskedInt64x2",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512MaskedUint8x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512MaskedUint16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512MaskedUint32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512MaskedUint64x2",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512Uint8x16",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512Uint16x8",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512Uint32x4",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "Broadcast512Uint64x2",
+               argLen:  1,
+               generic: true,
+       },
        {
                name:    "CeilFloat32x4",
                argLen:  1,
@@ -73153,6 +74008,18 @@ var opcodeTable = [...]opInfo{
                argLen:  2,
                generic: true,
        },
+       {
+               name:    "SetElemFloat32x4",
+               auxType: auxUInt8,
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "SetElemFloat64x2",
+               auxType: auxUInt8,
+               argLen:  2,
+               generic: true,
+       },
        {
                name:    "SetElemInt8x16",
                auxType: auxUInt8,
index c5367adefec4327a0360b959a8dd2c03e04b0399..0bdc0e63b7b5363b7b660b3ea29ab2f8b3dd4a7d 100644 (file)
@@ -1317,6 +1317,156 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpBitLen64(v)
        case OpBitLen8:
                return rewriteValueAMD64_OpBitLen8(v)
+       case OpBroadcast128Float32x4:
+               v.Op = OpAMD64VBROADCASTSS128
+               return true
+       case OpBroadcast128Float64x2:
+               v.Op = OpAMD64VPBROADCASTQ128
+               return true
+       case OpBroadcast128Int16x8:
+               v.Op = OpAMD64VPBROADCASTW128
+               return true
+       case OpBroadcast128Int32x4:
+               v.Op = OpAMD64VPBROADCASTD128
+               return true
+       case OpBroadcast128Int64x2:
+               v.Op = OpAMD64VPBROADCASTQ128
+               return true
+       case OpBroadcast128Int8x16:
+               v.Op = OpAMD64VPBROADCASTB128
+               return true
+       case OpBroadcast128MaskedFloat32x4:
+               return rewriteValueAMD64_OpBroadcast128MaskedFloat32x4(v)
+       case OpBroadcast128MaskedFloat64x2:
+               return rewriteValueAMD64_OpBroadcast128MaskedFloat64x2(v)
+       case OpBroadcast128MaskedInt16x8:
+               return rewriteValueAMD64_OpBroadcast128MaskedInt16x8(v)
+       case OpBroadcast128MaskedInt32x4:
+               return rewriteValueAMD64_OpBroadcast128MaskedInt32x4(v)
+       case OpBroadcast128MaskedInt64x2:
+               return rewriteValueAMD64_OpBroadcast128MaskedInt64x2(v)
+       case OpBroadcast128MaskedInt8x16:
+               return rewriteValueAMD64_OpBroadcast128MaskedInt8x16(v)
+       case OpBroadcast128MaskedUint16x8:
+               return rewriteValueAMD64_OpBroadcast128MaskedUint16x8(v)
+       case OpBroadcast128MaskedUint32x4:
+               return rewriteValueAMD64_OpBroadcast128MaskedUint32x4(v)
+       case OpBroadcast128MaskedUint64x2:
+               return rewriteValueAMD64_OpBroadcast128MaskedUint64x2(v)
+       case OpBroadcast128MaskedUint8x16:
+               return rewriteValueAMD64_OpBroadcast128MaskedUint8x16(v)
+       case OpBroadcast128Uint16x8:
+               v.Op = OpAMD64VPBROADCASTW128
+               return true
+       case OpBroadcast128Uint32x4:
+               v.Op = OpAMD64VPBROADCASTD128
+               return true
+       case OpBroadcast128Uint64x2:
+               v.Op = OpAMD64VPBROADCASTQ128
+               return true
+       case OpBroadcast128Uint8x16:
+               v.Op = OpAMD64VPBROADCASTB128
+               return true
+       case OpBroadcast256Float32x4:
+               v.Op = OpAMD64VBROADCASTSS256
+               return true
+       case OpBroadcast256Float64x2:
+               v.Op = OpAMD64VBROADCASTSD256
+               return true
+       case OpBroadcast256Int16x8:
+               v.Op = OpAMD64VPBROADCASTW256
+               return true
+       case OpBroadcast256Int32x4:
+               v.Op = OpAMD64VPBROADCASTD256
+               return true
+       case OpBroadcast256Int64x2:
+               v.Op = OpAMD64VPBROADCASTQ256
+               return true
+       case OpBroadcast256Int8x16:
+               v.Op = OpAMD64VPBROADCASTB256
+               return true
+       case OpBroadcast256MaskedFloat32x4:
+               return rewriteValueAMD64_OpBroadcast256MaskedFloat32x4(v)
+       case OpBroadcast256MaskedFloat64x2:
+               return rewriteValueAMD64_OpBroadcast256MaskedFloat64x2(v)
+       case OpBroadcast256MaskedInt16x8:
+               return rewriteValueAMD64_OpBroadcast256MaskedInt16x8(v)
+       case OpBroadcast256MaskedInt32x4:
+               return rewriteValueAMD64_OpBroadcast256MaskedInt32x4(v)
+       case OpBroadcast256MaskedInt64x2:
+               return rewriteValueAMD64_OpBroadcast256MaskedInt64x2(v)
+       case OpBroadcast256MaskedInt8x16:
+               return rewriteValueAMD64_OpBroadcast256MaskedInt8x16(v)
+       case OpBroadcast256MaskedUint16x8:
+               return rewriteValueAMD64_OpBroadcast256MaskedUint16x8(v)
+       case OpBroadcast256MaskedUint32x4:
+               return rewriteValueAMD64_OpBroadcast256MaskedUint32x4(v)
+       case OpBroadcast256MaskedUint64x2:
+               return rewriteValueAMD64_OpBroadcast256MaskedUint64x2(v)
+       case OpBroadcast256MaskedUint8x16:
+               return rewriteValueAMD64_OpBroadcast256MaskedUint8x16(v)
+       case OpBroadcast256Uint16x8:
+               v.Op = OpAMD64VPBROADCASTW256
+               return true
+       case OpBroadcast256Uint32x4:
+               v.Op = OpAMD64VPBROADCASTD256
+               return true
+       case OpBroadcast256Uint64x2:
+               v.Op = OpAMD64VPBROADCASTQ256
+               return true
+       case OpBroadcast256Uint8x16:
+               v.Op = OpAMD64VPBROADCASTB256
+               return true
+       case OpBroadcast512Float32x4:
+               v.Op = OpAMD64VBROADCASTSS512
+               return true
+       case OpBroadcast512Float64x2:
+               v.Op = OpAMD64VBROADCASTSD512
+               return true
+       case OpBroadcast512Int16x8:
+               v.Op = OpAMD64VPBROADCASTW512
+               return true
+       case OpBroadcast512Int32x4:
+               v.Op = OpAMD64VPBROADCASTD512
+               return true
+       case OpBroadcast512Int64x2:
+               v.Op = OpAMD64VPBROADCASTQ512
+               return true
+       case OpBroadcast512Int8x16:
+               v.Op = OpAMD64VPBROADCASTB512
+               return true
+       case OpBroadcast512MaskedFloat32x4:
+               return rewriteValueAMD64_OpBroadcast512MaskedFloat32x4(v)
+       case OpBroadcast512MaskedFloat64x2:
+               return rewriteValueAMD64_OpBroadcast512MaskedFloat64x2(v)
+       case OpBroadcast512MaskedInt16x8:
+               return rewriteValueAMD64_OpBroadcast512MaskedInt16x8(v)
+       case OpBroadcast512MaskedInt32x4:
+               return rewriteValueAMD64_OpBroadcast512MaskedInt32x4(v)
+       case OpBroadcast512MaskedInt64x2:
+               return rewriteValueAMD64_OpBroadcast512MaskedInt64x2(v)
+       case OpBroadcast512MaskedInt8x16:
+               return rewriteValueAMD64_OpBroadcast512MaskedInt8x16(v)
+       case OpBroadcast512MaskedUint16x8:
+               return rewriteValueAMD64_OpBroadcast512MaskedUint16x8(v)
+       case OpBroadcast512MaskedUint32x4:
+               return rewriteValueAMD64_OpBroadcast512MaskedUint32x4(v)
+       case OpBroadcast512MaskedUint64x2:
+               return rewriteValueAMD64_OpBroadcast512MaskedUint64x2(v)
+       case OpBroadcast512MaskedUint8x16:
+               return rewriteValueAMD64_OpBroadcast512MaskedUint8x16(v)
+       case OpBroadcast512Uint16x8:
+               v.Op = OpAMD64VPBROADCASTW512
+               return true
+       case OpBroadcast512Uint32x4:
+               v.Op = OpAMD64VPBROADCASTD512
+               return true
+       case OpBroadcast512Uint64x2:
+               v.Op = OpAMD64VPBROADCASTQ512
+               return true
+       case OpBroadcast512Uint8x16:
+               v.Op = OpAMD64VPBROADCASTB512
+               return true
        case OpBswap16:
                return rewriteValueAMD64_OpBswap16(v)
        case OpBswap32:
@@ -4539,6 +4689,12 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpSelect1(v)
        case OpSelectN:
                return rewriteValueAMD64_OpSelectN(v)
+       case OpSetElemFloat32x4:
+               v.Op = OpAMD64VPINSRD128
+               return true
+       case OpSetElemFloat64x2:
+               v.Op = OpAMD64VPINSRQ128
+               return true
        case OpSetElemInt16x8:
                v.Op = OpAMD64VPINSRW128
                return true
@@ -31628,6 +31784,486 @@ func rewriteValueAMD64_OpBitLen8(v *Value) bool {
        }
        return false
 }
+func rewriteValueAMD64_OpBroadcast128MaskedFloat32x4(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast128MaskedFloat32x4 x mask)
+       // result: (VBROADCASTSSMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VBROADCASTSSMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) // new TypeMask-typed VPMOVVec32x4ToM value; mask is attached on the next line
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast128MaskedFloat64x2(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast128MaskedFloat64x2 x mask)
+       // result: (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTQMasked128) // integer VPBROADCASTQ, not VBROADCASTSD as in the 256/512 Float64x2 rules; presumably because VBROADCASTSD has no 128-bit form
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast128MaskedInt16x8(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast128MaskedInt16x8 x mask)
+       // result: (VPBROADCASTWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTWMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) // new TypeMask-typed VPMOVVec16x8ToM value; mask is attached on the next line
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast128MaskedInt32x4(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast128MaskedInt32x4 x mask)
+       // result: (VPBROADCASTDMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTDMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) // new TypeMask-typed VPMOVVec32x4ToM value; mask is attached on the next line
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast128MaskedInt64x2(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast128MaskedInt64x2 x mask)
+       // result: (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTQMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) // new TypeMask-typed VPMOVVec64x2ToM value; mask is attached on the next line
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast128MaskedInt8x16(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast128MaskedInt8x16 x mask)
+       // result: (VPBROADCASTBMasked128 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTBMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) // new TypeMask-typed VPMOVVec8x16ToM value; mask is attached on the next line
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast128MaskedUint16x8(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast128MaskedUint16x8 x mask)
+       // result: (VPBROADCASTWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTWMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) // new TypeMask-typed VPMOVVec16x8ToM value; mask is attached on the next line
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast128MaskedUint32x4(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast128MaskedUint32x4 x mask)
+       // result: (VPBROADCASTDMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTDMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) // new TypeMask-typed VPMOVVec32x4ToM value; mask is attached on the next line
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast128MaskedUint64x2(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast128MaskedUint64x2 x mask)
+       // result: (VPBROADCASTQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTQMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) // new TypeMask-typed VPMOVVec64x2ToM value; mask is attached on the next line
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast128MaskedUint8x16(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast128MaskedUint8x16 x mask)
+       // result: (VPBROADCASTBMasked128 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTBMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) // new TypeMask-typed VPMOVVec8x16ToM value; mask is attached on the next line
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast256MaskedFloat32x4(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast256MaskedFloat32x4 x mask)
+       // result: (VBROADCASTSSMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VBROADCASTSSMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) // NOTE(review): mask is converted with the 32x4 shape while the result op is the 256 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast256MaskedFloat64x2(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast256MaskedFloat64x2 x mask)
+       // result: (VBROADCASTSDMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VBROADCASTSDMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) // NOTE(review): mask is converted with the 64x2 shape while the result op is the 256 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast256MaskedInt16x8(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast256MaskedInt16x8 x mask)
+       // result: (VPBROADCASTWMasked256 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTWMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) // NOTE(review): mask is converted with the 16x8 shape while the result op is the 256 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast256MaskedInt32x4(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast256MaskedInt32x4 x mask)
+       // result: (VPBROADCASTDMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTDMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) // NOTE(review): mask is converted with the 32x4 shape while the result op is the 256 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast256MaskedInt64x2(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast256MaskedInt64x2 x mask)
+       // result: (VPBROADCASTQMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTQMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) // NOTE(review): mask is converted with the 64x2 shape while the result op is the 256 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast256MaskedInt8x16(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast256MaskedInt8x16 x mask)
+       // result: (VPBROADCASTBMasked256 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTBMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) // NOTE(review): mask is converted with the 8x16 shape while the result op is the 256 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast256MaskedUint16x8(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast256MaskedUint16x8 x mask)
+       // result: (VPBROADCASTWMasked256 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTWMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) // NOTE(review): mask is converted with the 16x8 shape while the result op is the 256 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast256MaskedUint32x4(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast256MaskedUint32x4 x mask)
+       // result: (VPBROADCASTDMasked256 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTDMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) // NOTE(review): mask is converted with the 32x4 shape while the result op is the 256 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast256MaskedUint64x2(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast256MaskedUint64x2 x mask)
+       // result: (VPBROADCASTQMasked256 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTQMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) // NOTE(review): mask is converted with the 64x2 shape while the result op is the 256 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast256MaskedUint8x16(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast256MaskedUint8x16 x mask)
+       // result: (VPBROADCASTBMasked256 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTBMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) // NOTE(review): mask is converted with the 8x16 shape while the result op is the 256 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast512MaskedFloat32x4(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast512MaskedFloat32x4 x mask)
+       // result: (VBROADCASTSSMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VBROADCASTSSMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) // NOTE(review): mask is converted with the 32x4 shape while the result op is the 512 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast512MaskedFloat64x2(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast512MaskedFloat64x2 x mask)
+       // result: (VBROADCASTSDMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VBROADCASTSDMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) // NOTE(review): mask is converted with the 64x2 shape while the result op is the 512 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast512MaskedInt16x8(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast512MaskedInt16x8 x mask)
+       // result: (VPBROADCASTWMasked512 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTWMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) // NOTE(review): mask is converted with the 16x8 shape while the result op is the 512 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast512MaskedInt32x4(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast512MaskedInt32x4 x mask)
+       // result: (VPBROADCASTDMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTDMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) // NOTE(review): mask is converted with the 32x4 shape while the result op is the 512 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast512MaskedInt64x2(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast512MaskedInt64x2 x mask)
+       // result: (VPBROADCASTQMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTQMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) // NOTE(review): mask is converted with the 64x2 shape while the result op is the 512 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast512MaskedInt8x16(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast512MaskedInt8x16 x mask)
+       // result: (VPBROADCASTBMasked512 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTBMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) // NOTE(review): mask is converted with the 8x16 shape while the result op is the 512 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast512MaskedUint16x8(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast512MaskedUint16x8 x mask)
+       // result: (VPBROADCASTWMasked512 x (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTWMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) // NOTE(review): mask is converted with the 16x8 shape while the result op is the 512 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast512MaskedUint32x4(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast512MaskedUint32x4 x mask)
+       // result: (VPBROADCASTDMasked512 x (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTDMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) // NOTE(review): mask is converted with the 32x4 shape while the result op is the 512 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast512MaskedUint64x2(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast512MaskedUint64x2 x mask)
+       // result: (VPBROADCASTQMasked512 x (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTQMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) // NOTE(review): mask is converted with the 64x2 shape while the result op is the 512 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
+func rewriteValueAMD64_OpBroadcast512MaskedUint8x16(v *Value) bool {
+       v_1 := v.Args[1] // second operand: the mask named in the rule below
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Broadcast512MaskedUint8x16 x mask)
+       // result: (VPBROADCASTBMasked512 x (VPMOVVec8x16ToM <types.TypeMask> mask))
+       for { // unconditional rule: nothing is tested, so v is always rewritten
+               x := v_0
+               mask := v_1
+               v.reset(OpAMD64VPBROADCASTBMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) // NOTE(review): mask is converted with the 8x16 shape while the result op is the 512 variant; confirm intended
+               v0.AddArg(mask)
+               v.AddArg2(x, v0) // result operands: x, then the converted mask
+               return true
+       }
+}
 func rewriteValueAMD64_OpBswap16(v *Value) bool {
        v_0 := v.Args[0]
        // match: (Bswap16 x)
index e14e02a71e54447f216928a00c90a7e553fa19a3..7a95a4450d4ce246b36f4f1c971eb8761f399fc6 100644 (file)
@@ -240,6 +240,66 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Uint16x8.AverageMasked", opLen3(ssa.OpAverageMaskedUint16x8, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint16x16.AverageMasked", opLen3(ssa.OpAverageMaskedUint16x16, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Uint16x32.AverageMasked", opLen3(ssa.OpAverageMaskedUint16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x4.Broadcast128", opLen1(ssa.OpBroadcast128Float32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float64x2.Broadcast128", opLen1(ssa.OpBroadcast128Float64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int8x16.Broadcast128", opLen1(ssa.OpBroadcast128Int8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x8.Broadcast128", opLen1(ssa.OpBroadcast128Int16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x4.Broadcast128", opLen1(ssa.OpBroadcast128Int32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int64x2.Broadcast128", opLen1(ssa.OpBroadcast128Int64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint8x16.Broadcast128", opLen1(ssa.OpBroadcast128Uint8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint16x8.Broadcast128", opLen1(ssa.OpBroadcast128Uint16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint32x4.Broadcast128", opLen1(ssa.OpBroadcast128Uint32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint64x2.Broadcast128", opLen1(ssa.OpBroadcast128Uint64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float32x4.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedFloat32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float64x2.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedFloat64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int8x16.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedInt8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x8.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedInt16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x4.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int64x2.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedInt64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint8x16.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedUint8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint16x8.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedUint16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint32x4.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedUint32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint64x2.Broadcast128Masked", opLen2(ssa.OpBroadcast128MaskedUint64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float32x4.Broadcast256", opLen1(ssa.OpBroadcast256Float32x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float64x2.Broadcast256", opLen1(ssa.OpBroadcast256Float64x2, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x16.Broadcast256", opLen1(ssa.OpBroadcast256Int8x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x8.Broadcast256", opLen1(ssa.OpBroadcast256Int16x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x4.Broadcast256", opLen1(ssa.OpBroadcast256Int32x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int64x2.Broadcast256", opLen1(ssa.OpBroadcast256Int64x2, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint8x16.Broadcast256", opLen1(ssa.OpBroadcast256Uint8x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint16x8.Broadcast256", opLen1(ssa.OpBroadcast256Uint16x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint32x4.Broadcast256", opLen1(ssa.OpBroadcast256Uint32x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint64x2.Broadcast256", opLen1(ssa.OpBroadcast256Uint64x2, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float32x4.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedFloat32x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float64x2.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedFloat64x2, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x16.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedInt8x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x8.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedInt16x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x4.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedInt32x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int64x2.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedInt64x2, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint8x16.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedUint8x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint16x8.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedUint16x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint32x4.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedUint32x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint64x2.Broadcast256Masked", opLen2(ssa.OpBroadcast256MaskedUint64x2, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float32x4.Broadcast512", opLen1(ssa.OpBroadcast512Float32x4, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float64x2.Broadcast512", opLen1(ssa.OpBroadcast512Float64x2, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int8x16.Broadcast512", opLen1(ssa.OpBroadcast512Int8x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int16x8.Broadcast512", opLen1(ssa.OpBroadcast512Int16x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x4.Broadcast512", opLen1(ssa.OpBroadcast512Int32x4, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int64x2.Broadcast512", opLen1(ssa.OpBroadcast512Int64x2, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint8x16.Broadcast512", opLen1(ssa.OpBroadcast512Uint8x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint16x8.Broadcast512", opLen1(ssa.OpBroadcast512Uint16x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint32x4.Broadcast512", opLen1(ssa.OpBroadcast512Uint32x4, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint64x2.Broadcast512", opLen1(ssa.OpBroadcast512Uint64x2, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x4.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedFloat32x4, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float64x2.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedFloat64x2, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int8x16.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedInt8x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int16x8.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedInt16x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x4.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedInt32x4, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int64x2.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedInt64x2, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint8x16.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedUint8x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint16x8.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedUint16x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint32x4.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedUint32x4, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint64x2.Broadcast512Masked", opLen2(ssa.OpBroadcast512MaskedUint64x2, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Float32x4.Ceil", opLen1(ssa.OpCeilFloat32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float32x8.Ceil", opLen1(ssa.OpCeilFloat32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float64x2.Ceil", opLen1(ssa.OpCeilFloat64x2, types.TypeVec128), sys.AMD64)
@@ -1408,6 +1468,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Float64x2.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float64x4.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x4, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float64x8.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x4.SetElem", opLen2Imm8(ssa.OpSetElemFloat32x4, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Float64x2.SetElem", opLen2Imm8(ssa.OpSetElemFloat64x2, types.TypeVec128, 0), sys.AMD64)
        addF(simdPackage, "Int8x16.SetElem", opLen2Imm8(ssa.OpSetElemInt8x16, types.TypeVec128, 0), sys.AMD64)
        addF(simdPackage, "Int16x8.SetElem", opLen2Imm8(ssa.OpSetElemInt16x8, types.TypeVec128, 0), sys.AMD64)
        addF(simdPackage, "Int32x4.SetElem", opLen2Imm8(ssa.OpSetElemInt32x4, types.TypeVec128, 0), sys.AMD64)
index c7c6aae374265d36c9cb9badb6b40d4aa1cb0f16..8b36da71ab95a8c0f381c06f47ff775674868974 100644 (file)
@@ -87,6 +87,23 @@ var ternaryFlaky = &shapes{ // for tests that support flaky equality
        floats: []int{32},
 }
 
+// templateData holds the per-vector-type values that oneTemplate passes
+// to the code-generation template it executes.
+type templateData struct {
+       Vec    string // the name of the vector type, e.g. Float32x4
+       AOrAn  string // for documentation, the article "a" or "an"
+       Width  int    // the bit width of the element type, e.g. 32
+       Vwidth int    // the bit width of the whole vector (Width*Count), e.g. 128
+       Count  int    // the number of elements, e.g. 4
+       WxC    string // the width-by-count string, e.g., "32x4"
+       BxC    string // as if bytes, in the proper count, e.g., "8x16" (W==8)
+       Base   string // the capitalized base type of the vector, e.g., "Float"
+       Type   string // the element type, e.g. "float32"
+       OxFF   string // a hex literal masking the lowest Count bits, e.g. "0xf"
+}
+
+// As128BitVec returns the name of the 128-bit vector type that has the
+// same base type and element width as t, e.g. "Float32x4" when Base is
+// "Float" and Width is 32.
+func (t templateData) As128BitVec() string {
+       return fmt.Sprintf("%s%dx%d", t.Base, t.Width, 128/t.Width)
+}
+
 func oneTemplate(t *template.Template, baseType string, width, count int, out io.Writer) {
        b := width * count
        if b < 128 || b > 512 {
@@ -102,26 +119,17 @@ func oneTemplate(t *template.Template, baseType string, width, count int, out io
                aOrAn = "an"
        }
        oxFF := fmt.Sprintf("0x%x", uint64((1<<count)-1))
-       t.Execute(out, struct {
-               Vec   string // the type of the vector, e.g. Float32x4
-               AOrAn string // for documentation, the article "a" or "an"
-               Width int    // the bit width of the element type, e.g. 32
-               Count int    // the number of elements, e.g. 4
-               WxC   string // the width-by-type string, e.g., "32x4"
-               BxC   string // as if bytes, in the proper count, e.g., "8x16" (W==8)
-               Base  string // the capitalized Base Type of the vector, e.g., "Float"
-               Type  string // the element type, e.g. "float32"
-               OxFF  string // a mask for the lowest 'count' bits
-       }{
-               Vec:   vType,
-               AOrAn: aOrAn,
-               Width: width,
-               Count: count,
-               WxC:   wxc,
-               BxC:   bxc,
-               Base:  BaseType,
-               Type:  eType,
-               OxFF:  oxFF,
+       t.Execute(out, templateData{
+               Vec:    vType,
+               AOrAn:  aOrAn,
+               Width:  width,
+               Vwidth: b,
+               Count:  count,
+               WxC:    wxc,
+               BxC:    bxc,
+               Base:   BaseType,
+               Type:   eType,
+               OxFF:   oxFF,
        })
 }
 
@@ -480,7 +488,7 @@ func (x {{.Vec}}) StoreSlicePart(s []{{.Type}}) {
 
 var unsafePATemplate = templateOf("unsafe PA helper", `
 // pa{{.Vec}} returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only 
+// only be used with partial load/store operations that only
 // access the known-safe portions of the array.
 func pa{{.Vec}}(s []{{.Type}}) *[{{.Count}}]{{.Type}} {
        return (*[{{.Count}}]{{.Type}})(unsafe.Pointer(&s[0]))
@@ -500,7 +508,7 @@ func (x {{.Vec}}) Masked(mask Mask{{.WxC}}) {{.Vec}} {
 
 // Merge returns x but with elements set to y where mask is false.
 func (x {{.Vec}}) Merge(y {{.Vec}}, mask Mask{{.WxC}}) {{.Vec}} {
-{{- if eq .BxC .WxC }}
+{{- if eq .BxC .WxC -}}
        im := mask.AsInt{{.BxC}}()
 {{- else}}
     im := mask.AsInt{{.WxC}}().AsInt{{.BxC}}()
@@ -539,6 +547,32 @@ func (x {{.Vec}}) Merge(y {{.Vec}}, mask Mask{{.WxC}}) {{.Vec}} {
 }
 `)
 
+// CPUfeatureBC returns the CPU feature name that the generated
+// Broadcast documentation reports for t's vector type: "AVX2" for
+// 128- and 256-bit vectors, and for 512-bit vectors "AVX512BW" when
+// elements are 16 bits or narrower, "AVX512F" otherwise.
+// It panics on any other vector width.
+func (t templateData) CPUfeatureBC() string {
+       switch t.Vwidth {
+       case 128:
+               return "AVX2"
+       case 256:
+               return "AVX2"
+       case 512:
+               if t.Width <= 16 {
+                       return "AVX512BW"
+               }
+               return "AVX512F"
+       }
+       panic(fmt.Errorf("unexpected vector width %d", t.Vwidth))
+}
+
+// broadcastTemplate emits a Broadcast<Vec> function for a vector type.
+// The generated function writes x into element 0 of a zero-valued
+// 128-bit vector (see As128BitVec) and then calls that vector's
+// Broadcast<Vwidth> method to fill every element of the result.
+var broadcastTemplate = templateOf("Broadcast functions", `
+// Broadcast{{.Vec}} returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature {{.CPUfeatureBC}}
+func Broadcast{{.Vec}}(x {{.Type}}) {{.Vec}} {
+       var z {{.As128BitVec }}
+       return z.SetElem(0, x).Broadcast{{.Vwidth}}()
+}
+`)
+
 func main() {
        sl := flag.String("sl", "slice_amd64.go", "file name for slice operations")
        ush := flag.String("ush", "unsafe_helpers.go", "file name for unsafe helpers")
@@ -557,6 +591,7 @@ func main() {
                        avx2SmallLoadSlicePartTemplate,
                        avx2MaskedTemplate,
                        avx512MaskedTemplate,
+                       broadcastTemplate,
                )
        }
        if *ush != "" {
index 43f36de2b553c13c67de7012d178e239ba7383b8..5b7754a9611b50e35b6cf03706fa7495dabec2f0 100644 (file)
@@ -1386,6 +1386,438 @@ func (x Uint16x16) AverageMasked(y Uint16x16, mask Mask16x16) Uint16x16
 // Asm: VPAVGW, CPU Feature: AVX512
 func (x Uint16x32) AverageMasked(y Uint16x32, mask Mask16x32) Uint16x32
 
+/* Broadcast128 */
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX2
+func (x Float32x4) Broadcast128() Float32x4
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Float64x2) Broadcast128() Float64x2
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Int8x16) Broadcast128() Int8x16
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Int16x8) Broadcast128() Int16x8
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Int32x4) Broadcast128() Int32x4
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Int64x2) Broadcast128() Int64x2
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Uint8x16) Broadcast128() Uint8x16
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Uint16x8) Broadcast128() Uint16x8
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Uint32x4) Broadcast128() Uint32x4
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Uint64x2) Broadcast128() Uint64x2
+
+/* Broadcast128Masked */
+
+// Broadcast128Masked copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX512
+func (x Float32x4) Broadcast128Masked(mask Mask32x4) Float32x4
+
+// Broadcast128Masked copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Float64x2) Broadcast128Masked(mask Mask64x2) Float64x2
+
+// Broadcast128Masked copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Int8x16) Broadcast128Masked(mask Mask8x16) Int8x16
+
+// Broadcast128Masked copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Int16x8) Broadcast128Masked(mask Mask16x8) Int16x8
+
+// Broadcast128Masked copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Int32x4) Broadcast128Masked(mask Mask32x4) Int32x4
+
+// Broadcast128Masked copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Int64x2) Broadcast128Masked(mask Mask64x2) Int64x2
+
+// Broadcast128Masked copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Uint8x16) Broadcast128Masked(mask Mask8x16) Uint8x16
+
+// Broadcast128Masked copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Uint16x8) Broadcast128Masked(mask Mask16x8) Uint16x8
+
+// Broadcast128Masked copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Uint32x4) Broadcast128Masked(mask Mask32x4) Uint32x4
+
+// Broadcast128Masked copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Uint64x2) Broadcast128Masked(mask Mask64x2) Uint64x2
+
+/* Broadcast256 */
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX2
+func (x Float32x4) Broadcast256() Float32x8
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VBROADCASTSD, CPU Feature: AVX2
+func (x Float64x2) Broadcast256() Float64x4
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Int8x16) Broadcast256() Int8x32
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Int16x8) Broadcast256() Int16x16
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Int32x4) Broadcast256() Int32x8
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Int64x2) Broadcast256() Int64x4
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Uint8x16) Broadcast256() Uint8x32
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Uint16x8) Broadcast256() Uint16x16
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Uint32x4) Broadcast256() Uint32x8
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Uint64x2) Broadcast256() Uint64x4
+
+/* Broadcast256Masked */
+
+// Broadcast256Masked copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX512
+func (x Float32x4) Broadcast256Masked(mask Mask32x4) Float32x8
+
+// Broadcast256Masked copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VBROADCASTSD, CPU Feature: AVX512
+func (x Float64x2) Broadcast256Masked(mask Mask64x2) Float64x4
+
+// Broadcast256Masked copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Int8x16) Broadcast256Masked(mask Mask8x16) Int8x32
+
+// Broadcast256Masked copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Int16x8) Broadcast256Masked(mask Mask16x8) Int16x16
+
+// Broadcast256Masked copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Int32x4) Broadcast256Masked(mask Mask32x4) Int32x8
+
+// Broadcast256Masked copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Int64x2) Broadcast256Masked(mask Mask64x2) Int64x4
+
+// Broadcast256Masked copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Uint8x16) Broadcast256Masked(mask Mask8x16) Uint8x32
+
+// Broadcast256Masked copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Uint16x8) Broadcast256Masked(mask Mask16x8) Uint16x16
+
+// Broadcast256Masked copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Uint32x4) Broadcast256Masked(mask Mask32x4) Uint32x8
+
+// Broadcast256Masked copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Uint64x2) Broadcast256Masked(mask Mask64x2) Uint64x4
+
+/* Broadcast512 */
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX512
+func (x Float32x4) Broadcast512() Float32x16
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VBROADCASTSD, CPU Feature: AVX512
+func (x Float64x2) Broadcast512() Float64x8
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Int8x16) Broadcast512() Int8x64
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Int16x8) Broadcast512() Int16x32
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Int32x4) Broadcast512() Int32x16
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Int64x2) Broadcast512() Int64x8
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Uint8x16) Broadcast512() Uint8x64
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Uint16x8) Broadcast512() Uint16x32
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Uint32x4) Broadcast512() Uint32x16
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Uint64x2) Broadcast512() Uint64x8
+
+/* Broadcast512Masked */
+
+// Broadcast512Masked copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX512
+func (x Float32x4) Broadcast512Masked(mask Mask32x4) Float32x16
+
+// Broadcast512Masked copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VBROADCASTSD, CPU Feature: AVX512
+func (x Float64x2) Broadcast512Masked(mask Mask64x2) Float64x8
+
+// Broadcast512Masked copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Int8x16) Broadcast512Masked(mask Mask8x16) Int8x64
+
+// Broadcast512Masked copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Int16x8) Broadcast512Masked(mask Mask16x8) Int16x32
+
+// Broadcast512Masked copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Int32x4) Broadcast512Masked(mask Mask32x4) Int32x16
+
+// Broadcast512Masked copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Int64x2) Broadcast512Masked(mask Mask64x2) Int64x8
+
+// Broadcast512Masked copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Uint8x16) Broadcast512Masked(mask Mask8x16) Uint8x64
+
+// Broadcast512Masked copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Uint16x8) Broadcast512Masked(mask Mask16x8) Uint16x32
+
+// Broadcast512Masked copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Uint32x4) Broadcast512Masked(mask Mask32x4) Uint32x16
+
+// Broadcast512Masked copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Uint64x2) Broadcast512Masked(mask Mask64x2) Uint64x8
+
 /* Ceil */
 
 // Ceil rounds elements up to the nearest integer.
@@ -9116,6 +9548,20 @@ func (x Float64x8) ScaleMasked(y Float64x8, mask Mask64x8) Float64x8
 
 /* SetElem */
 
+// SetElem sets a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPINSRD, CPU Feature: AVX
+func (x Float32x4) SetElem(index uint8, y float32) Float32x4
+
+// SetElem sets a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPINSRQ, CPU Feature: AVX
+func (x Float64x2) SetElem(index uint8, y float64) Float64x2
+
 // SetElem sets a single constant-indexed element's value.
 //
 // index results in better performance when it's a constant, a non-constant value will be translated into a jump table.
index f28daf1bc7314dac18a9dcbf7bffc9a99a0990c8..831dc4f268bdfbe3b5b75943efac69ce9997b6ce 100644 (file)
@@ -412,3 +412,15 @@ func TestRotateAllVariable(t *testing.T) {
                }
        }
 }
+
+// TestBroadcastUint32x4 broadcasts a uint32 into a Uint32x4, stores the
+// vector to a slice, and checks that all four elements hold that value.
+func TestBroadcastUint32x4(t *testing.T) {
+       s := make([]uint32, 4, 4)
+       simd.BroadcastUint32x4(123456789).StoreSlice(s)
+       checkSlices(t, s, []uint32{123456789, 123456789, 123456789, 123456789})
+}
+
+// TestBroadcastFloat32x8 broadcasts a float32 into a Float32x8, stores the
+// vector to a slice, and checks that all eight elements hold that value.
+func TestBroadcastFloat32x8(t *testing.T) {
+       s := make([]float32, 8, 8)
+       simd.BroadcastFloat32x8(123456789).StoreSlice(s)
+       checkSlices(t, s, []float32{123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789})
+}
index a43660cba4e9b919176382dd31eef52c42252925..8e721d90279680a5aa5319cf4044de5d89479856 100644 (file)
@@ -1499,3 +1499,273 @@ func (x Float64x8) Merge(y Float64x8, mask Mask64x8) Float64x8 {
        iy := y.AsInt64x8()
        return iy.blendMasked(ix, mask).AsFloat64x8()
 }
+
+// BroadcastInt8x16 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastInt8x16(x int8) Int8x16 {
+       var z Int8x16
+       return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastInt16x8 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastInt16x8(x int16) Int16x8 {
+       var z Int16x8
+       return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastInt32x4 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastInt32x4(x int32) Int32x4 {
+       var z Int32x4
+       return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastInt64x2 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastInt64x2(x int64) Int64x2 {
+       var z Int64x2
+       return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastUint8x16 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastUint8x16(x uint8) Uint8x16 {
+       var z Uint8x16
+       return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastUint16x8 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastUint16x8(x uint16) Uint16x8 {
+       var z Uint16x8
+       return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastUint32x4 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastUint32x4(x uint32) Uint32x4 {
+       var z Uint32x4
+       return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastUint64x2 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastUint64x2(x uint64) Uint64x2 {
+       var z Uint64x2
+       return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastFloat32x4 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastFloat32x4(x float32) Float32x4 {
+       var z Float32x4
+       return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastFloat64x2 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastFloat64x2(x float64) Float64x2 {
+       var z Float64x2
+       return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastInt8x32 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastInt8x32(x int8) Int8x32 {
+       var z Int8x16
+       return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastInt16x16 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastInt16x16(x int16) Int16x16 {
+       var z Int16x8
+       return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastInt32x8 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastInt32x8(x int32) Int32x8 {
+       var z Int32x4
+       return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastInt64x4 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastInt64x4(x int64) Int64x4 {
+       var z Int64x2
+       return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastUint8x32 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastUint8x32(x uint8) Uint8x32 {
+       var z Uint8x16
+       return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastUint16x16 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastUint16x16(x uint16) Uint16x16 {
+       var z Uint16x8
+       return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastUint32x8 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastUint32x8(x uint32) Uint32x8 {
+       var z Uint32x4
+       return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastUint64x4 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastUint64x4(x uint64) Uint64x4 {
+       var z Uint64x2
+       return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastFloat32x8 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastFloat32x8(x float32) Float32x8 {
+       var z Float32x4
+       return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastFloat64x4 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastFloat64x4(x float64) Float64x4 {
+       var z Float64x2
+       return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastInt8x64 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512BW
+func BroadcastInt8x64(x int8) Int8x64 {
+       var z Int8x16
+       return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastInt16x32 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512BW
+func BroadcastInt16x32(x int16) Int16x32 {
+       var z Int16x8
+       return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastInt32x16 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512F
+func BroadcastInt32x16(x int32) Int32x16 {
+       var z Int32x4
+       return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastInt64x8 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512F
+func BroadcastInt64x8(x int64) Int64x8 {
+       var z Int64x2
+       return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastUint8x64 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512BW
+func BroadcastUint8x64(x uint8) Uint8x64 {
+       var z Uint8x16
+       return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastUint16x32 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512BW
+func BroadcastUint16x32(x uint16) Uint16x32 {
+       var z Uint16x8
+       return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastUint32x16 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512F
+func BroadcastUint32x16(x uint32) Uint32x16 {
+       var z Uint32x4
+       return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastUint64x8 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512F
+func BroadcastUint64x8(x uint64) Uint64x8 {
+       var z Uint64x2
+       return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastFloat32x16 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512F
+func BroadcastFloat32x16(x float32) Float32x16 {
+       var z Float32x4
+       return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastFloat64x8 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512F
+func BroadcastFloat64x8(x float64) Float64x8 {
+       var z Float64x2
+       return z.SetElem(0, x).Broadcast512()
+}