Cypherpunks repositories - gostls13.git/commitdiff
[dev.simd] cmd/compile: add more dot products
author Junyang Shao <shaojunyang@google.com>
Thu, 12 Jun 2025 16:43:10 +0000 (16:43 +0000)
committer Junyang Shao <shaojunyang@google.com>
Fri, 13 Jun 2025 19:15:18 +0000 (12:15 -0700)
This CL is generated by CL 680215.

Change-Id: Ie085e65e0473a8e96170702d7265d379ec8812ba
Reviewed-on: https://go-review.googlesource.com/c/go/+/681298
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>

src/cmd/compile/internal/amd64/simdssa.go
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteAMD64.go
src/cmd/compile/internal/ssagen/simdintrinsics.go
src/simd/stubs_amd64.go

index 02353c7f7b47b1da3a9184ddb4e19863a5267505..7e9abbd3cbe3170fd1e17babf09ae2c4761b1f1b 100644 (file)
@@ -679,6 +679,34 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPCMPBMasked512:
                p = simdFp2k1k1Imm8(s, v)
 
+       case ssa.OpAMD64VPDPWSSD128,
+               ssa.OpAMD64VPDPWSSD256,
+               ssa.OpAMD64VPDPWSSD512,
+               ssa.OpAMD64VPDPWSSDS128,
+               ssa.OpAMD64VPDPWSSDS256,
+               ssa.OpAMD64VPDPWSSDS512,
+               ssa.OpAMD64VPDPBUSDS128,
+               ssa.OpAMD64VPDPBUSDS256,
+               ssa.OpAMD64VPDPBUSDS512,
+               ssa.OpAMD64VPDPBUSD128,
+               ssa.OpAMD64VPDPBUSD256,
+               ssa.OpAMD64VPDPBUSD512:
+               p = simdFp31ResultInArg0(s, v)
+
+       case ssa.OpAMD64VPDPWSSDMasked512,
+               ssa.OpAMD64VPDPWSSDMasked128,
+               ssa.OpAMD64VPDPWSSDMasked256,
+               ssa.OpAMD64VPDPWSSDSMasked512,
+               ssa.OpAMD64VPDPWSSDSMasked128,
+               ssa.OpAMD64VPDPWSSDSMasked256,
+               ssa.OpAMD64VPDPBUSDSMasked512,
+               ssa.OpAMD64VPDPBUSDSMasked128,
+               ssa.OpAMD64VPDPBUSDSMasked256,
+               ssa.OpAMD64VPDPBUSDMasked512,
+               ssa.OpAMD64VPDPBUSDMasked128,
+               ssa.OpAMD64VPDPBUSDMasked256:
+               p = simdFp3k1fp1ResultInArg0(s, v)
+
        default:
                // Unknown reg shape
                return false
@@ -884,6 +912,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPMADDWDMasked256,
                ssa.OpAMD64VPMADDWDMasked512,
                ssa.OpAMD64VPMADDWDMasked128,
+               ssa.OpAMD64VPDPWSSDMasked512,
+               ssa.OpAMD64VPDPWSSDMasked128,
+               ssa.OpAMD64VPDPWSSDMasked256,
                ssa.OpAMD64VPOPCNTWMasked256,
                ssa.OpAMD64VPOPCNTWMasked512,
                ssa.OpAMD64VPOPCNTWMasked128,
@@ -902,6 +933,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPADDSBMasked128,
                ssa.OpAMD64VPADDSBMasked256,
                ssa.OpAMD64VPADDSBMasked512,
+               ssa.OpAMD64VPDPWSSDSMasked512,
+               ssa.OpAMD64VPDPWSSDSMasked128,
+               ssa.OpAMD64VPDPWSSDSMasked256,
                ssa.OpAMD64VPSUBSWMasked256,
                ssa.OpAMD64VPSUBSWMasked512,
                ssa.OpAMD64VPSUBSWMasked128,
@@ -911,6 +945,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPMADDUBSWMasked256,
                ssa.OpAMD64VPMADDUBSWMasked512,
                ssa.OpAMD64VPMADDUBSWMasked128,
+               ssa.OpAMD64VPDPBUSDSMasked512,
+               ssa.OpAMD64VPDPBUSDSMasked128,
+               ssa.OpAMD64VPDPBUSDSMasked256,
                ssa.OpAMD64VSQRTPSMasked512,
                ssa.OpAMD64VSQRTPSMasked128,
                ssa.OpAMD64VSQRTPSMasked256,
@@ -929,6 +966,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPSUBBMasked128,
                ssa.OpAMD64VPSUBBMasked256,
                ssa.OpAMD64VPSUBBMasked512,
+               ssa.OpAMD64VPDPBUSDMasked512,
+               ssa.OpAMD64VPDPBUSDMasked128,
+               ssa.OpAMD64VPDPBUSDMasked256,
                ssa.OpAMD64VXORPSMasked512,
                ssa.OpAMD64VXORPSMasked128,
                ssa.OpAMD64VXORPSMasked256,
index d5caf09daccd6fa13b8a56504b440b4f2eb1197d..efee484b9993c65130fb428e34ab7ada1b62f347 100644 (file)
 (MaskedPairDotProdInt16x16 x y mask) => (VPMADDWDMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
 (MaskedPairDotProdInt16x32 x y mask) => (VPMADDWDMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
 (MaskedPairDotProdInt16x8 x y mask) => (VPMADDWDMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(MaskedPairDotProdAccumulateInt32x16 x y z mask) => (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+(MaskedPairDotProdAccumulateInt32x4 x y z mask) => (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+(MaskedPairDotProdAccumulateInt32x8 x y z mask) => (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
 (MaskedPopCountInt16x16 x mask) => (VPOPCNTWMasked256 x (VPMOVVec16x16ToM <types.TypeMask> mask))
 (MaskedPopCountInt16x32 x mask) => (VPOPCNTWMasked512 x (VPMOVVec16x32ToM <types.TypeMask> mask))
 (MaskedPopCountInt16x8 x mask) => (VPOPCNTWMasked128 x (VPMOVVec16x8ToM <types.TypeMask> mask))
 (MaskedSaturatedAddUint8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
 (MaskedSaturatedAddUint8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
 (MaskedSaturatedAddUint8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+(MaskedSaturatedPairDotProdAccumulateInt32x16 x y z mask) => (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+(MaskedSaturatedPairDotProdAccumulateInt32x4 x y z mask) => (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+(MaskedSaturatedPairDotProdAccumulateInt32x8 x y z mask) => (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
 (MaskedSaturatedSubInt16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
 (MaskedSaturatedSubInt16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
 (MaskedSaturatedSubInt16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
 (MaskedSaturatedUnsignedSignedPairDotProdUint16x16 x y mask) => (VPMADDUBSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
 (MaskedSaturatedUnsignedSignedPairDotProdUint16x32 x y mask) => (VPMADDUBSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
 (MaskedSaturatedUnsignedSignedPairDotProdUint16x8 x y mask) => (VPMADDUBSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16 x y z mask) => (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+(MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4 x y z mask) => (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+(MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8 x y z mask) => (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+(MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16 x y z mask) => (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+(MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4 x y z mask) => (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+(MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8 x y z mask) => (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
 (MaskedSqrtFloat32x16 x mask) => (VSQRTPSMasked512 x (VPMOVVec32x16ToM <types.TypeMask> mask))
 (MaskedSqrtFloat32x4 x mask) => (VSQRTPSMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
 (MaskedSqrtFloat32x8 x mask) => (VSQRTPSMasked256 x (VPMOVVec32x8ToM <types.TypeMask> mask))
 (MaskedTruncWithPrecisionFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+3] x (VPMOVVec64x2ToM <types.TypeMask> mask))
 (MaskedTruncWithPrecisionFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+3] x (VPMOVVec64x4ToM <types.TypeMask> mask))
 (MaskedTruncWithPrecisionFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+3] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(MaskedUnsignedSignedQuadDotProdAccumulateInt32x16 x y z mask) => (VPDPBUSDMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+(MaskedUnsignedSignedQuadDotProdAccumulateInt32x4 x y z mask) => (VPDPBUSDMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+(MaskedUnsignedSignedQuadDotProdAccumulateInt32x8 x y z mask) => (VPDPBUSDMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+(MaskedUnsignedSignedQuadDotProdAccumulateUint32x16 x y z mask) => (VPDPBUSDMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+(MaskedUnsignedSignedQuadDotProdAccumulateUint32x4 x y z mask) => (VPDPBUSDMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+(MaskedUnsignedSignedQuadDotProdAccumulateUint32x8 x y z mask) => (VPDPBUSDMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
 (MaskedXorFloat32x16 x y mask) => (VXORPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
 (MaskedXorFloat32x4 x y mask) => (VXORPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
 (MaskedXorFloat32x8 x y mask) => (VXORPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
 (PairDotProdInt16x16 ...) => (VPMADDWD256 ...)
 (PairDotProdInt16x32 ...) => (VPMADDWD512 ...)
 (PairDotProdInt16x8 ...) => (VPMADDWD128 ...)
+(PairDotProdAccumulateInt32x16 ...) => (VPDPWSSD512 ...)
+(PairDotProdAccumulateInt32x4 ...) => (VPDPWSSD128 ...)
+(PairDotProdAccumulateInt32x8 ...) => (VPDPWSSD256 ...)
 (PairwiseAddFloat32x4 ...) => (VHADDPS128 ...)
 (PairwiseAddFloat32x8 ...) => (VHADDPS256 ...)
 (PairwiseAddFloat64x2 ...) => (VHADDPD128 ...)
 (SaturatedAddUint8x16 ...) => (VPADDSB128 ...)
 (SaturatedAddUint8x32 ...) => (VPADDSB256 ...)
 (SaturatedAddUint8x64 ...) => (VPADDSB512 ...)
+(SaturatedPairDotProdAccumulateInt32x16 ...) => (VPDPWSSDS512 ...)
+(SaturatedPairDotProdAccumulateInt32x4 ...) => (VPDPWSSDS128 ...)
+(SaturatedPairDotProdAccumulateInt32x8 ...) => (VPDPWSSDS256 ...)
 (SaturatedPairwiseAddInt16x16 ...) => (VPHADDSW256 ...)
 (SaturatedPairwiseAddInt16x8 ...) => (VPHADDSW128 ...)
 (SaturatedPairwiseSubInt16x16 ...) => (VPHSUBSW256 ...)
 (SaturatedUnsignedSignedPairDotProdUint16x8 ...) => (VPMADDUBSW128 ...)
 (SaturatedUnsignedSignedPairDotProdUint8x16 ...) => (VPMADDUBSW128 ...)
 (SaturatedUnsignedSignedPairDotProdUint8x32 ...) => (VPMADDUBSW256 ...)
+(SaturatedUnsignedSignedQuadDotProdAccumulateInt32x16 ...) => (VPDPBUSDS512 ...)
+(SaturatedUnsignedSignedQuadDotProdAccumulateInt32x4 ...) => (VPDPBUSDS128 ...)
+(SaturatedUnsignedSignedQuadDotProdAccumulateInt32x8 ...) => (VPDPBUSDS256 ...)
+(SaturatedUnsignedSignedQuadDotProdAccumulateUint32x16 ...) => (VPDPBUSDS512 ...)
+(SaturatedUnsignedSignedQuadDotProdAccumulateUint32x4 ...) => (VPDPBUSDS128 ...)
+(SaturatedUnsignedSignedQuadDotProdAccumulateUint32x8 ...) => (VPDPBUSDS256 ...)
 (SignInt16x16 ...) => (VPSIGNW256 ...)
 (SignInt16x8 ...) => (VPSIGNW128 ...)
 (SignInt32x4 ...) => (VPSIGND128 ...)
 (TruncWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+3] x)
 (TruncWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+3] x)
 (TruncWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+3] x)
+(UnsignedSignedQuadDotProdAccumulateInt32x16 ...) => (VPDPBUSD512 ...)
+(UnsignedSignedQuadDotProdAccumulateInt32x4 ...) => (VPDPBUSD128 ...)
+(UnsignedSignedQuadDotProdAccumulateInt32x8 ...) => (VPDPBUSD256 ...)
+(UnsignedSignedQuadDotProdAccumulateUint32x16 ...) => (VPDPBUSD512 ...)
+(UnsignedSignedQuadDotProdAccumulateUint32x4 ...) => (VPDPBUSD128 ...)
+(UnsignedSignedQuadDotProdAccumulateUint32x8 ...) => (VPDPBUSD256 ...)
 (XorFloat32x16 ...) => (VXORPS512 ...)
 (XorFloat32x4 ...) => (VXORPS128 ...)
 (XorFloat32x8 ...) => (VXORPS256 ...)
index f580973c9dcb19067ed79886d6bfd4c973bf80b0..6cc405c0300fcbd5c6b618fff3f4d876ccdc8547 100644 (file)
@@ -283,15 +283,23 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
                {name: "VPMINSDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPMINSD", commutative: true, typ: "Vec512", resultInArg0: false},
                {name: "VPMULLDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPMULLD", commutative: true, typ: "Vec512", resultInArg0: false},
                {name: "VPORDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPORD", commutative: true, typ: "Vec512", resultInArg0: false},
+               {name: "VPDPWSSDMasked512", argLength: 4, reg: fp3k1fp1, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true},
                {name: "VPOPCNTDMasked512", argLength: 2, reg: fp1k1fp1, asm: "VPOPCNTD", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPDPWSSDSMasked512", argLength: 4, reg: fp3k1fp1, asm: "VPDPWSSDS", commutative: false, typ: "Vec512", resultInArg0: true},
+               {name: "VPDPBUSDSMasked512", argLength: 4, reg: fp3k1fp1, asm: "VPDPBUSDS", commutative: false, typ: "Vec512", resultInArg0: true},
                {name: "VPSUBDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPSUBD", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPDPBUSDMasked512", argLength: 4, reg: fp3k1fp1, asm: "VPDPBUSD", commutative: false, typ: "Vec512", resultInArg0: true},
                {name: "VPXORDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPXORD", commutative: true, typ: "Vec512", resultInArg0: false},
                {name: "VPMAXSD512", argLength: 2, reg: fp21, asm: "VPMAXSD", commutative: true, typ: "Vec512", resultInArg0: false},
                {name: "VPMINSD512", argLength: 2, reg: fp21, asm: "VPMINSD", commutative: true, typ: "Vec512", resultInArg0: false},
                {name: "VPMULLD512", argLength: 2, reg: fp21, asm: "VPMULLD", commutative: true, typ: "Vec512", resultInArg0: false},
                {name: "VPORD512", argLength: 2, reg: fp21, asm: "VPORD", commutative: true, typ: "Vec512", resultInArg0: false},
+               {name: "VPDPWSSD512", argLength: 3, reg: fp31, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true},
                {name: "VPOPCNTD512", argLength: 1, reg: fp11, asm: "VPOPCNTD", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPDPWSSDS512", argLength: 3, reg: fp31, asm: "VPDPWSSDS", commutative: false, typ: "Vec512", resultInArg0: true},
+               {name: "VPDPBUSDS512", argLength: 3, reg: fp31, asm: "VPDPBUSDS", commutative: false, typ: "Vec512", resultInArg0: true},
                {name: "VPSUBD512", argLength: 2, reg: fp21, asm: "VPSUBD", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPDPBUSD512", argLength: 3, reg: fp31, asm: "VPDPBUSD", commutative: false, typ: "Vec512", resultInArg0: true},
                {name: "VPXORD512", argLength: 2, reg: fp21, asm: "VPXORD", commutative: true, typ: "Vec512", resultInArg0: false},
                {name: "VPABSD128", argLength: 1, reg: fp11, asm: "VPABSD", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VPADDD128", argLength: 2, reg: fp21, asm: "VPADDD", commutative: true, typ: "Vec128", resultInArg0: false},
@@ -307,18 +315,26 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
                {name: "VPMINSDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPMINSD", commutative: true, typ: "Vec128", resultInArg0: false},
                {name: "VPMULLDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPMULLD", commutative: true, typ: "Vec128", resultInArg0: false},
                {name: "VPORDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPORD", commutative: true, typ: "Vec128", resultInArg0: false},
+               {name: "VPDPWSSDMasked128", argLength: 4, reg: fp3k1fp1, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true},
                {name: "VPOPCNTDMasked128", argLength: 2, reg: fp1k1fp1, asm: "VPOPCNTD", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPDPWSSDSMasked128", argLength: 4, reg: fp3k1fp1, asm: "VPDPWSSDS", commutative: false, typ: "Vec128", resultInArg0: true},
+               {name: "VPDPBUSDSMasked128", argLength: 4, reg: fp3k1fp1, asm: "VPDPBUSDS", commutative: false, typ: "Vec128", resultInArg0: true},
                {name: "VPSUBDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPSUBD", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPDPBUSDMasked128", argLength: 4, reg: fp3k1fp1, asm: "VPDPBUSD", commutative: false, typ: "Vec128", resultInArg0: true},
                {name: "VPXORDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPXORD", commutative: true, typ: "Vec128", resultInArg0: false},
                {name: "VPMAXSD128", argLength: 2, reg: fp21, asm: "VPMAXSD", commutative: true, typ: "Vec128", resultInArg0: false},
                {name: "VPMINSD128", argLength: 2, reg: fp21, asm: "VPMINSD", commutative: true, typ: "Vec128", resultInArg0: false},
                {name: "VPMULDQ128", argLength: 2, reg: fp21, asm: "VPMULDQ", commutative: true, typ: "Vec128", resultInArg0: false},
                {name: "VPMULLD128", argLength: 2, reg: fp21, asm: "VPMULLD", commutative: true, typ: "Vec128", resultInArg0: false},
+               {name: "VPDPWSSD128", argLength: 3, reg: fp31, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true},
                {name: "VPHADDD128", argLength: 2, reg: fp21, asm: "VPHADDD", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VPHSUBD128", argLength: 2, reg: fp21, asm: "VPHSUBD", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VPOPCNTD128", argLength: 1, reg: fp11, asm: "VPOPCNTD", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPDPWSSDS128", argLength: 3, reg: fp31, asm: "VPDPWSSDS", commutative: false, typ: "Vec128", resultInArg0: true},
+               {name: "VPDPBUSDS128", argLength: 3, reg: fp31, asm: "VPDPBUSDS", commutative: false, typ: "Vec128", resultInArg0: true},
                {name: "VPSIGND128", argLength: 2, reg: fp21, asm: "VPSIGND", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VPSUBD128", argLength: 2, reg: fp21, asm: "VPSUBD", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPDPBUSD128", argLength: 3, reg: fp31, asm: "VPDPBUSD", commutative: false, typ: "Vec128", resultInArg0: true},
                {name: "VPABSD256", argLength: 1, reg: fp11, asm: "VPABSD", commutative: false, typ: "Vec256", resultInArg0: false},
                {name: "VPADDD256", argLength: 2, reg: fp21, asm: "VPADDD", commutative: true, typ: "Vec256", resultInArg0: false},
                {name: "VPCMPEQD256", argLength: 2, reg: fp21, asm: "VPCMPEQD", commutative: true, typ: "Vec256", resultInArg0: false},
@@ -333,18 +349,26 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
                {name: "VPMINSDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPMINSD", commutative: true, typ: "Vec256", resultInArg0: false},
                {name: "VPMULLDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPMULLD", commutative: true, typ: "Vec256", resultInArg0: false},
                {name: "VPORDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPORD", commutative: true, typ: "Vec256", resultInArg0: false},
+               {name: "VPDPWSSDMasked256", argLength: 4, reg: fp3k1fp1, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true},
                {name: "VPOPCNTDMasked256", argLength: 2, reg: fp1k1fp1, asm: "VPOPCNTD", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPDPWSSDSMasked256", argLength: 4, reg: fp3k1fp1, asm: "VPDPWSSDS", commutative: false, typ: "Vec256", resultInArg0: true},
+               {name: "VPDPBUSDSMasked256", argLength: 4, reg: fp3k1fp1, asm: "VPDPBUSDS", commutative: false, typ: "Vec256", resultInArg0: true},
                {name: "VPSUBDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPSUBD", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPDPBUSDMasked256", argLength: 4, reg: fp3k1fp1, asm: "VPDPBUSD", commutative: false, typ: "Vec256", resultInArg0: true},
                {name: "VPXORDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPXORD", commutative: true, typ: "Vec256", resultInArg0: false},
                {name: "VPMAXSD256", argLength: 2, reg: fp21, asm: "VPMAXSD", commutative: true, typ: "Vec256", resultInArg0: false},
                {name: "VPMINSD256", argLength: 2, reg: fp21, asm: "VPMINSD", commutative: true, typ: "Vec256", resultInArg0: false},
                {name: "VPMULDQ256", argLength: 2, reg: fp21, asm: "VPMULDQ", commutative: true, typ: "Vec256", resultInArg0: false},
                {name: "VPMULLD256", argLength: 2, reg: fp21, asm: "VPMULLD", commutative: true, typ: "Vec256", resultInArg0: false},
+               {name: "VPDPWSSD256", argLength: 3, reg: fp31, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true},
                {name: "VPHADDD256", argLength: 2, reg: fp21, asm: "VPHADDD", commutative: false, typ: "Vec256", resultInArg0: false},
                {name: "VPHSUBD256", argLength: 2, reg: fp21, asm: "VPHSUBD", commutative: false, typ: "Vec256", resultInArg0: false},
                {name: "VPOPCNTD256", argLength: 1, reg: fp11, asm: "VPOPCNTD", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPDPWSSDS256", argLength: 3, reg: fp31, asm: "VPDPWSSDS", commutative: false, typ: "Vec256", resultInArg0: true},
+               {name: "VPDPBUSDS256", argLength: 3, reg: fp31, asm: "VPDPBUSDS", commutative: false, typ: "Vec256", resultInArg0: true},
                {name: "VPSIGND256", argLength: 2, reg: fp21, asm: "VPSIGND", commutative: false, typ: "Vec256", resultInArg0: false},
                {name: "VPSUBD256", argLength: 2, reg: fp21, asm: "VPSUBD", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPDPBUSD256", argLength: 3, reg: fp31, asm: "VPDPBUSD", commutative: false, typ: "Vec256", resultInArg0: true},
                {name: "VPABSQ128", argLength: 1, reg: fp11, asm: "VPABSQ", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VPADDQ128", argLength: 2, reg: fp21, asm: "VPADDQ", commutative: true, typ: "Vec128", resultInArg0: false},
                {name: "VPCMPEQQ128", argLength: 2, reg: fp21, asm: "VPCMPEQQ", commutative: true, typ: "Vec128", resultInArg0: false},
index 3e3411e0dfb1801a5fd3e8d3a4e08f22199bd5dd..404f1fc69fd07d052c4b0f74d3956ca27b084492 100644 (file)
@@ -427,16 +427,24 @@ func simdGenericOps() []opData {
                {name: "MaskedMulLowInt32x16", argLength: 3, commutative: true},
                {name: "MaskedNotEqualInt32x16", argLength: 3, commutative: true},
                {name: "MaskedOrInt32x16", argLength: 3, commutative: true},
+               {name: "MaskedPairDotProdAccumulateInt32x16", argLength: 4, commutative: false},
                {name: "MaskedPopCountInt32x16", argLength: 2, commutative: false},
+               {name: "MaskedSaturatedPairDotProdAccumulateInt32x16", argLength: 4, commutative: false},
+               {name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16", argLength: 4, commutative: false},
                {name: "MaskedSubInt32x16", argLength: 3, commutative: false},
+               {name: "MaskedUnsignedSignedQuadDotProdAccumulateInt32x16", argLength: 4, commutative: false},
                {name: "MaskedXorInt32x16", argLength: 3, commutative: true},
                {name: "MaxInt32x16", argLength: 2, commutative: true},
                {name: "MinInt32x16", argLength: 2, commutative: true},
                {name: "MulLowInt32x16", argLength: 2, commutative: true},
                {name: "NotEqualInt32x16", argLength: 2, commutative: true},
                {name: "OrInt32x16", argLength: 2, commutative: true},
+               {name: "PairDotProdAccumulateInt32x16", argLength: 3, commutative: false},
                {name: "PopCountInt32x16", argLength: 1, commutative: false},
+               {name: "SaturatedPairDotProdAccumulateInt32x16", argLength: 3, commutative: false},
+               {name: "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x16", argLength: 3, commutative: false},
                {name: "SubInt32x16", argLength: 2, commutative: false},
+               {name: "UnsignedSignedQuadDotProdAccumulateInt32x16", argLength: 3, commutative: false},
                {name: "XorInt32x16", argLength: 2, commutative: true},
                {name: "AbsoluteInt32x4", argLength: 1, commutative: false},
                {name: "AddInt32x4", argLength: 2, commutative: true},
@@ -461,8 +469,12 @@ func simdGenericOps() []opData {
                {name: "MaskedMulLowInt32x4", argLength: 3, commutative: true},
                {name: "MaskedNotEqualInt32x4", argLength: 3, commutative: true},
                {name: "MaskedOrInt32x4", argLength: 3, commutative: true},
+               {name: "MaskedPairDotProdAccumulateInt32x4", argLength: 4, commutative: false},
                {name: "MaskedPopCountInt32x4", argLength: 2, commutative: false},
+               {name: "MaskedSaturatedPairDotProdAccumulateInt32x4", argLength: 4, commutative: false},
+               {name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4", argLength: 4, commutative: false},
                {name: "MaskedSubInt32x4", argLength: 3, commutative: false},
+               {name: "MaskedUnsignedSignedQuadDotProdAccumulateInt32x4", argLength: 4, commutative: false},
                {name: "MaskedXorInt32x4", argLength: 3, commutative: true},
                {name: "MaxInt32x4", argLength: 2, commutative: true},
                {name: "MinInt32x4", argLength: 2, commutative: true},
@@ -470,11 +482,15 @@ func simdGenericOps() []opData {
                {name: "MulLowInt32x4", argLength: 2, commutative: true},
                {name: "NotEqualInt32x4", argLength: 2, commutative: true},
                {name: "OrInt32x4", argLength: 2, commutative: true},
+               {name: "PairDotProdAccumulateInt32x4", argLength: 3, commutative: false},
                {name: "PairwiseAddInt32x4", argLength: 2, commutative: false},
                {name: "PairwiseSubInt32x4", argLength: 2, commutative: false},
                {name: "PopCountInt32x4", argLength: 1, commutative: false},
+               {name: "SaturatedPairDotProdAccumulateInt32x4", argLength: 3, commutative: false},
+               {name: "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x4", argLength: 3, commutative: false},
                {name: "SignInt32x4", argLength: 2, commutative: false},
                {name: "SubInt32x4", argLength: 2, commutative: false},
+               {name: "UnsignedSignedQuadDotProdAccumulateInt32x4", argLength: 3, commutative: false},
                {name: "XorInt32x4", argLength: 2, commutative: true},
                {name: "AbsoluteInt32x8", argLength: 1, commutative: false},
                {name: "AddInt32x8", argLength: 2, commutative: true},
@@ -499,8 +515,12 @@ func simdGenericOps() []opData {
                {name: "MaskedMulLowInt32x8", argLength: 3, commutative: true},
                {name: "MaskedNotEqualInt32x8", argLength: 3, commutative: true},
                {name: "MaskedOrInt32x8", argLength: 3, commutative: true},
+               {name: "MaskedPairDotProdAccumulateInt32x8", argLength: 4, commutative: false},
                {name: "MaskedPopCountInt32x8", argLength: 2, commutative: false},
+               {name: "MaskedSaturatedPairDotProdAccumulateInt32x8", argLength: 4, commutative: false},
+               {name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8", argLength: 4, commutative: false},
                {name: "MaskedSubInt32x8", argLength: 3, commutative: false},
+               {name: "MaskedUnsignedSignedQuadDotProdAccumulateInt32x8", argLength: 4, commutative: false},
                {name: "MaskedXorInt32x8", argLength: 3, commutative: true},
                {name: "MaxInt32x8", argLength: 2, commutative: true},
                {name: "MinInt32x8", argLength: 2, commutative: true},
@@ -508,11 +528,15 @@ func simdGenericOps() []opData {
                {name: "MulLowInt32x8", argLength: 2, commutative: true},
                {name: "NotEqualInt32x8", argLength: 2, commutative: true},
                {name: "OrInt32x8", argLength: 2, commutative: true},
+               {name: "PairDotProdAccumulateInt32x8", argLength: 3, commutative: false},
                {name: "PairwiseAddInt32x8", argLength: 2, commutative: false},
                {name: "PairwiseSubInt32x8", argLength: 2, commutative: false},
                {name: "PopCountInt32x8", argLength: 1, commutative: false},
+               {name: "SaturatedPairDotProdAccumulateInt32x8", argLength: 3, commutative: false},
+               {name: "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x8", argLength: 3, commutative: false},
                {name: "SignInt32x8", argLength: 2, commutative: false},
                {name: "SubInt32x8", argLength: 2, commutative: false},
+               {name: "UnsignedSignedQuadDotProdAccumulateInt32x8", argLength: 3, commutative: false},
                {name: "XorInt32x8", argLength: 2, commutative: true},
                {name: "AbsoluteInt64x2", argLength: 1, commutative: false},
                {name: "AddInt64x2", argLength: 2, commutative: true},
@@ -845,14 +869,18 @@ func simdGenericOps() []opData {
                {name: "MaskedNotEqualUint32x16", argLength: 3, commutative: true},
                {name: "MaskedOrUint32x16", argLength: 3, commutative: true},
                {name: "MaskedPopCountUint32x16", argLength: 2, commutative: false},
+               {name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16", argLength: 4, commutative: false},
                {name: "MaskedSubUint32x16", argLength: 3, commutative: false},
+               {name: "MaskedUnsignedSignedQuadDotProdAccumulateUint32x16", argLength: 4, commutative: false},
                {name: "MaskedXorUint32x16", argLength: 3, commutative: true},
                {name: "MaxUint32x16", argLength: 2, commutative: true},
                {name: "MinUint32x16", argLength: 2, commutative: true},
                {name: "NotEqualUint32x16", argLength: 2, commutative: true},
                {name: "OrUint32x16", argLength: 2, commutative: true},
                {name: "PopCountUint32x16", argLength: 1, commutative: false},
+               {name: "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x16", argLength: 3, commutative: false},
                {name: "SubUint32x16", argLength: 2, commutative: false},
+               {name: "UnsignedSignedQuadDotProdAccumulateUint32x16", argLength: 3, commutative: false},
                {name: "XorUint32x16", argLength: 2, commutative: true},
                {name: "AddUint32x4", argLength: 2, commutative: true},
                {name: "AndUint32x4", argLength: 2, commutative: true},
@@ -875,7 +903,9 @@ func simdGenericOps() []opData {
                {name: "MaskedNotEqualUint32x4", argLength: 3, commutative: true},
                {name: "MaskedOrUint32x4", argLength: 3, commutative: true},
                {name: "MaskedPopCountUint32x4", argLength: 2, commutative: false},
+               {name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4", argLength: 4, commutative: false},
                {name: "MaskedSubUint32x4", argLength: 3, commutative: false},
+               {name: "MaskedUnsignedSignedQuadDotProdAccumulateUint32x4", argLength: 4, commutative: false},
                {name: "MaskedXorUint32x4", argLength: 3, commutative: true},
                {name: "MaxUint32x4", argLength: 2, commutative: true},
                {name: "MinUint32x4", argLength: 2, commutative: true},
@@ -885,7 +915,9 @@ func simdGenericOps() []opData {
                {name: "PairwiseAddUint32x4", argLength: 2, commutative: false},
                {name: "PairwiseSubUint32x4", argLength: 2, commutative: false},
                {name: "PopCountUint32x4", argLength: 1, commutative: false},
+               {name: "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x4", argLength: 3, commutative: false},
                {name: "SubUint32x4", argLength: 2, commutative: false},
+               {name: "UnsignedSignedQuadDotProdAccumulateUint32x4", argLength: 3, commutative: false},
                {name: "XorUint32x4", argLength: 2, commutative: true},
                {name: "AddUint32x8", argLength: 2, commutative: true},
                {name: "AndUint32x8", argLength: 2, commutative: true},
@@ -908,7 +940,9 @@ func simdGenericOps() []opData {
                {name: "MaskedNotEqualUint32x8", argLength: 3, commutative: true},
                {name: "MaskedOrUint32x8", argLength: 3, commutative: true},
                {name: "MaskedPopCountUint32x8", argLength: 2, commutative: false},
+               {name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8", argLength: 4, commutative: false},
                {name: "MaskedSubUint32x8", argLength: 3, commutative: false},
+               {name: "MaskedUnsignedSignedQuadDotProdAccumulateUint32x8", argLength: 4, commutative: false},
                {name: "MaskedXorUint32x8", argLength: 3, commutative: true},
                {name: "MaxUint32x8", argLength: 2, commutative: true},
                {name: "MinUint32x8", argLength: 2, commutative: true},
@@ -918,7 +952,9 @@ func simdGenericOps() []opData {
                {name: "PairwiseAddUint32x8", argLength: 2, commutative: false},
                {name: "PairwiseSubUint32x8", argLength: 2, commutative: false},
                {name: "PopCountUint32x8", argLength: 1, commutative: false},
+               {name: "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x8", argLength: 3, commutative: false},
                {name: "SubUint32x8", argLength: 2, commutative: false},
+               {name: "UnsignedSignedQuadDotProdAccumulateUint32x8", argLength: 3, commutative: false},
                {name: "XorUint32x8", argLength: 2, commutative: true},
                {name: "AddUint64x2", argLength: 2, commutative: true},
                {name: "AndUint64x2", argLength: 2, commutative: true},
index 3ef08ae5559121cec6f7f9643eacdd9b923b3b1b..26facad933461bf9905cab43087344ddaa810ed3 100644 (file)
@@ -1476,15 +1476,23 @@ const (
        OpAMD64VPMINSDMasked512
        OpAMD64VPMULLDMasked512
        OpAMD64VPORDMasked512
+       OpAMD64VPDPWSSDMasked512
        OpAMD64VPOPCNTDMasked512
+       OpAMD64VPDPWSSDSMasked512
+       OpAMD64VPDPBUSDSMasked512
        OpAMD64VPSUBDMasked512
+       OpAMD64VPDPBUSDMasked512
        OpAMD64VPXORDMasked512
        OpAMD64VPMAXSD512
        OpAMD64VPMINSD512
        OpAMD64VPMULLD512
        OpAMD64VPORD512
+       OpAMD64VPDPWSSD512
        OpAMD64VPOPCNTD512
+       OpAMD64VPDPWSSDS512
+       OpAMD64VPDPBUSDS512
        OpAMD64VPSUBD512
+       OpAMD64VPDPBUSD512
        OpAMD64VPXORD512
        OpAMD64VPABSD128
        OpAMD64VPADDD128
@@ -1500,18 +1508,26 @@ const (
        OpAMD64VPMINSDMasked128
        OpAMD64VPMULLDMasked128
        OpAMD64VPORDMasked128
+       OpAMD64VPDPWSSDMasked128
        OpAMD64VPOPCNTDMasked128
+       OpAMD64VPDPWSSDSMasked128
+       OpAMD64VPDPBUSDSMasked128
        OpAMD64VPSUBDMasked128
+       OpAMD64VPDPBUSDMasked128
        OpAMD64VPXORDMasked128
        OpAMD64VPMAXSD128
        OpAMD64VPMINSD128
        OpAMD64VPMULDQ128
        OpAMD64VPMULLD128
+       OpAMD64VPDPWSSD128
        OpAMD64VPHADDD128
        OpAMD64VPHSUBD128
        OpAMD64VPOPCNTD128
+       OpAMD64VPDPWSSDS128
+       OpAMD64VPDPBUSDS128
        OpAMD64VPSIGND128
        OpAMD64VPSUBD128
+       OpAMD64VPDPBUSD128
        OpAMD64VPABSD256
        OpAMD64VPADDD256
        OpAMD64VPCMPEQD256
@@ -1526,18 +1542,26 @@ const (
        OpAMD64VPMINSDMasked256
        OpAMD64VPMULLDMasked256
        OpAMD64VPORDMasked256
+       OpAMD64VPDPWSSDMasked256
        OpAMD64VPOPCNTDMasked256
+       OpAMD64VPDPWSSDSMasked256
+       OpAMD64VPDPBUSDSMasked256
        OpAMD64VPSUBDMasked256
+       OpAMD64VPDPBUSDMasked256
        OpAMD64VPXORDMasked256
        OpAMD64VPMAXSD256
        OpAMD64VPMINSD256
        OpAMD64VPMULDQ256
        OpAMD64VPMULLD256
+       OpAMD64VPDPWSSD256
        OpAMD64VPHADDD256
        OpAMD64VPHSUBD256
        OpAMD64VPOPCNTD256
+       OpAMD64VPDPWSSDS256
+       OpAMD64VPDPBUSDS256
        OpAMD64VPSIGND256
        OpAMD64VPSUBD256
+       OpAMD64VPDPBUSD256
        OpAMD64VPABSQ128
        OpAMD64VPADDQ128
        OpAMD64VPCMPEQQ128
@@ -4491,16 +4515,24 @@ const (
        OpMaskedMulLowInt32x16
        OpMaskedNotEqualInt32x16
        OpMaskedOrInt32x16
+       OpMaskedPairDotProdAccumulateInt32x16
        OpMaskedPopCountInt32x16
+       OpMaskedSaturatedPairDotProdAccumulateInt32x16
+       OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16
        OpMaskedSubInt32x16
+       OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x16
        OpMaskedXorInt32x16
        OpMaxInt32x16
        OpMinInt32x16
        OpMulLowInt32x16
        OpNotEqualInt32x16
        OpOrInt32x16
+       OpPairDotProdAccumulateInt32x16
        OpPopCountInt32x16
+       OpSaturatedPairDotProdAccumulateInt32x16
+       OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16
        OpSubInt32x16
+       OpUnsignedSignedQuadDotProdAccumulateInt32x16
        OpXorInt32x16
        OpAbsoluteInt32x4
        OpAddInt32x4
@@ -4525,8 +4557,12 @@ const (
        OpMaskedMulLowInt32x4
        OpMaskedNotEqualInt32x4
        OpMaskedOrInt32x4
+       OpMaskedPairDotProdAccumulateInt32x4
        OpMaskedPopCountInt32x4
+       OpMaskedSaturatedPairDotProdAccumulateInt32x4
+       OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4
        OpMaskedSubInt32x4
+       OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x4
        OpMaskedXorInt32x4
        OpMaxInt32x4
        OpMinInt32x4
@@ -4534,11 +4570,15 @@ const (
        OpMulLowInt32x4
        OpNotEqualInt32x4
        OpOrInt32x4
+       OpPairDotProdAccumulateInt32x4
        OpPairwiseAddInt32x4
        OpPairwiseSubInt32x4
        OpPopCountInt32x4
+       OpSaturatedPairDotProdAccumulateInt32x4
+       OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4
        OpSignInt32x4
        OpSubInt32x4
+       OpUnsignedSignedQuadDotProdAccumulateInt32x4
        OpXorInt32x4
        OpAbsoluteInt32x8
        OpAddInt32x8
@@ -4563,8 +4603,12 @@ const (
        OpMaskedMulLowInt32x8
        OpMaskedNotEqualInt32x8
        OpMaskedOrInt32x8
+       OpMaskedPairDotProdAccumulateInt32x8
        OpMaskedPopCountInt32x8
+       OpMaskedSaturatedPairDotProdAccumulateInt32x8
+       OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8
        OpMaskedSubInt32x8
+       OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x8
        OpMaskedXorInt32x8
        OpMaxInt32x8
        OpMinInt32x8
@@ -4572,11 +4616,15 @@ const (
        OpMulLowInt32x8
        OpNotEqualInt32x8
        OpOrInt32x8
+       OpPairDotProdAccumulateInt32x8
        OpPairwiseAddInt32x8
        OpPairwiseSubInt32x8
        OpPopCountInt32x8
+       OpSaturatedPairDotProdAccumulateInt32x8
+       OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8
        OpSignInt32x8
        OpSubInt32x8
+       OpUnsignedSignedQuadDotProdAccumulateInt32x8
        OpXorInt32x8
        OpAbsoluteInt64x2
        OpAddInt64x2
@@ -4909,14 +4957,18 @@ const (
        OpMaskedNotEqualUint32x16
        OpMaskedOrUint32x16
        OpMaskedPopCountUint32x16
+       OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16
        OpMaskedSubUint32x16
+       OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x16
        OpMaskedXorUint32x16
        OpMaxUint32x16
        OpMinUint32x16
        OpNotEqualUint32x16
        OpOrUint32x16
        OpPopCountUint32x16
+       OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16
        OpSubUint32x16
+       OpUnsignedSignedQuadDotProdAccumulateUint32x16
        OpXorUint32x16
        OpAddUint32x4
        OpAndUint32x4
@@ -4939,7 +4991,9 @@ const (
        OpMaskedNotEqualUint32x4
        OpMaskedOrUint32x4
        OpMaskedPopCountUint32x4
+       OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4
        OpMaskedSubUint32x4
+       OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x4
        OpMaskedXorUint32x4
        OpMaxUint32x4
        OpMinUint32x4
@@ -4949,7 +5003,9 @@ const (
        OpPairwiseAddUint32x4
        OpPairwiseSubUint32x4
        OpPopCountUint32x4
+       OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4
        OpSubUint32x4
+       OpUnsignedSignedQuadDotProdAccumulateUint32x4
        OpXorUint32x4
        OpAddUint32x8
        OpAndUint32x8
@@ -4972,7 +5028,9 @@ const (
        OpMaskedNotEqualUint32x8
        OpMaskedOrUint32x8
        OpMaskedPopCountUint32x8
+       OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8
        OpMaskedSubUint32x8
+       OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x8
        OpMaskedXorUint32x8
        OpMaxUint32x8
        OpMinUint32x8
@@ -4982,7 +5040,9 @@ const (
        OpPairwiseAddUint32x8
        OpPairwiseSubUint32x8
        OpPopCountUint32x8
+       OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8
        OpSubUint32x8
+       OpUnsignedSignedQuadDotProdAccumulateUint32x8
        OpXorUint32x8
        OpAddUint64x2
        OpAndUint64x2
@@ -22116,6 +22176,23 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPWSSDMasked512",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPDPWSSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:   "VPOPCNTDMasked512",
                argLen: 2,
@@ -22130,6 +22207,40 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPWSSDSMasked512",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPDPWSSDS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPDPBUSDSMasked512",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPDPBUSDS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:   "VPSUBDMasked512",
                argLen: 3,
@@ -22145,6 +22256,23 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPBUSDMasked512",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPDPBUSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:        "VPXORDMasked512",
                argLen:      3,
@@ -22221,6 +22349,22 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPWSSD512",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPDPWSSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:   "VPOPCNTD512",
                argLen: 1,
@@ -22234,6 +22378,38 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPWSSDS512",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPDPWSSDS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPDPBUSDS512",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPDPBUSDS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:   "VPSUBD512",
                argLen: 2,
@@ -22248,6 +22424,22 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPBUSD512",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPDPBUSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:        "VPXORD512",
                argLen:      2,
@@ -22477,6 +22669,23 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPWSSDMasked128",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPDPWSSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:   "VPOPCNTDMasked128",
                argLen: 2,
@@ -22491,6 +22700,40 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPWSSDSMasked128",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPDPWSSDS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPDPBUSDSMasked128",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPDPBUSDS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:   "VPSUBDMasked128",
                argLen: 3,
@@ -22506,6 +22749,23 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPBUSDMasked128",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPDPBUSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:        "VPXORDMasked128",
                argLen:      3,
@@ -22582,6 +22842,22 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPWSSD128",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPDPWSSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:   "VPHADDD128",
                argLen: 2,
@@ -22623,6 +22899,38 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPWSSDS128",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPDPWSSDS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPDPBUSDS128",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPDPBUSDS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:   "VPSIGND128",
                argLen: 2,
@@ -22651,6 +22959,22 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPBUSD128",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPDPBUSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:   "VPABSD256",
                argLen: 1,
@@ -22865,6 +23189,23 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPWSSDMasked256",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPDPWSSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:   "VPOPCNTDMasked256",
                argLen: 2,
@@ -22879,6 +23220,40 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPWSSDSMasked256",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPDPWSSDS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPDPBUSDSMasked256",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPDPBUSDS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:   "VPSUBDMasked256",
                argLen: 3,
@@ -22894,6 +23269,23 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPBUSDMasked256",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPDPBUSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},    // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:        "VPXORDMasked256",
                argLen:      3,
@@ -22970,6 +23362,22 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPWSSD256",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPDPWSSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:   "VPHADDD256",
                argLen: 2,
@@ -23011,6 +23419,38 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPWSSDS256",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPDPWSSDS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPDPBUSDS256",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPDPBUSDS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:   "VPSIGND256",
                argLen: 2,
@@ -23039,6 +23479,22 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "VPDPBUSD256",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPDPBUSD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:   "VPABSQ128",
                argLen: 1,
@@ -57134,16 +57590,36 @@ var opcodeTable = [...]opInfo{
                commutative: true,
                generic:     true,
        },
+       {
+               name:    "MaskedPairDotProdAccumulateInt32x16",
+               argLen:  4,
+               generic: true,
+       },
        {
                name:    "MaskedPopCountInt32x16",
                argLen:  2,
                generic: true,
        },
+       {
+               name:    "MaskedSaturatedPairDotProdAccumulateInt32x16",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16",
+               argLen:  4,
+               generic: true,
+       },
        {
                name:    "MaskedSubInt32x16",
                argLen:  3,
                generic: true,
        },
+       {
+               name:    "MaskedUnsignedSignedQuadDotProdAccumulateInt32x16",
+               argLen:  4,
+               generic: true,
+       },
        {
                name:        "MaskedXorInt32x16",
                argLen:      3,
@@ -57180,16 +57656,36 @@ var opcodeTable = [...]opInfo{
                commutative: true,
                generic:     true,
        },
+       {
+               name:    "PairDotProdAccumulateInt32x16",
+               argLen:  3,
+               generic: true,
+       },
        {
                name:    "PopCountInt32x16",
                argLen:  1,
                generic: true,
        },
+       {
+               name:    "SaturatedPairDotProdAccumulateInt32x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x16",
+               argLen:  3,
+               generic: true,
+       },
        {
                name:    "SubInt32x16",
                argLen:  2,
                generic: true,
        },
+       {
+               name:    "UnsignedSignedQuadDotProdAccumulateInt32x16",
+               argLen:  3,
+               generic: true,
+       },
        {
                name:        "XorInt32x16",
                argLen:      2,
@@ -57324,16 +57820,36 @@ var opcodeTable = [...]opInfo{
                commutative: true,
                generic:     true,
        },
+       {
+               name:    "MaskedPairDotProdAccumulateInt32x4",
+               argLen:  4,
+               generic: true,
+       },
        {
                name:    "MaskedPopCountInt32x4",
                argLen:  2,
                generic: true,
        },
+       {
+               name:    "MaskedSaturatedPairDotProdAccumulateInt32x4",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4",
+               argLen:  4,
+               generic: true,
+       },
        {
                name:    "MaskedSubInt32x4",
                argLen:  3,
                generic: true,
        },
+       {
+               name:    "MaskedUnsignedSignedQuadDotProdAccumulateInt32x4",
+               argLen:  4,
+               generic: true,
+       },
        {
                name:        "MaskedXorInt32x4",
                argLen:      3,
@@ -57376,6 +57892,11 @@ var opcodeTable = [...]opInfo{
                commutative: true,
                generic:     true,
        },
+       {
+               name:    "PairDotProdAccumulateInt32x4",
+               argLen:  3,
+               generic: true,
+       },
        {
                name:    "PairwiseAddInt32x4",
                argLen:  2,
@@ -57391,6 +57912,16 @@ var opcodeTable = [...]opInfo{
                argLen:  1,
                generic: true,
        },
+       {
+               name:    "SaturatedPairDotProdAccumulateInt32x4",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x4",
+               argLen:  3,
+               generic: true,
+       },
        {
                name:    "SignInt32x4",
                argLen:  2,
@@ -57401,6 +57932,11 @@ var opcodeTable = [...]opInfo{
                argLen:  2,
                generic: true,
        },
+       {
+               name:    "UnsignedSignedQuadDotProdAccumulateInt32x4",
+               argLen:  3,
+               generic: true,
+       },
        {
                name:        "XorInt32x4",
                argLen:      2,
@@ -57535,16 +58071,36 @@ var opcodeTable = [...]opInfo{
                commutative: true,
                generic:     true,
        },
+       {
+               name:    "MaskedPairDotProdAccumulateInt32x8",
+               argLen:  4,
+               generic: true,
+       },
        {
                name:    "MaskedPopCountInt32x8",
                argLen:  2,
                generic: true,
        },
+       {
+               name:    "MaskedSaturatedPairDotProdAccumulateInt32x8",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8",
+               argLen:  4,
+               generic: true,
+       },
        {
                name:    "MaskedSubInt32x8",
                argLen:  3,
                generic: true,
        },
+       {
+               name:    "MaskedUnsignedSignedQuadDotProdAccumulateInt32x8",
+               argLen:  4,
+               generic: true,
+       },
        {
                name:        "MaskedXorInt32x8",
                argLen:      3,
@@ -57587,6 +58143,11 @@ var opcodeTable = [...]opInfo{
                commutative: true,
                generic:     true,
        },
+       {
+               name:    "PairDotProdAccumulateInt32x8",
+               argLen:  3,
+               generic: true,
+       },
        {
                name:    "PairwiseAddInt32x8",
                argLen:  2,
@@ -57602,6 +58163,16 @@ var opcodeTable = [...]opInfo{
                argLen:  1,
                generic: true,
        },
+       {
+               name:    "SaturatedPairDotProdAccumulateInt32x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x8",
+               argLen:  3,
+               generic: true,
+       },
        {
                name:    "SignInt32x8",
                argLen:  2,
@@ -57612,6 +58183,11 @@ var opcodeTable = [...]opInfo{
                argLen:  2,
                generic: true,
        },
+       {
+               name:    "UnsignedSignedQuadDotProdAccumulateInt32x8",
+               argLen:  3,
+               generic: true,
+       },
        {
                name:        "XorInt32x8",
                argLen:      2,
@@ -59451,11 +60027,21 @@ var opcodeTable = [...]opInfo{
                argLen:  2,
                generic: true,
        },
+       {
+               name:    "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16",
+               argLen:  4,
+               generic: true,
+       },
        {
                name:    "MaskedSubUint32x16",
                argLen:  3,
                generic: true,
        },
+       {
+               name:    "MaskedUnsignedSignedQuadDotProdAccumulateUint32x16",
+               argLen:  4,
+               generic: true,
+       },
        {
                name:        "MaskedXorUint32x16",
                argLen:      3,
@@ -59491,11 +60077,21 @@ var opcodeTable = [...]opInfo{
                argLen:  1,
                generic: true,
        },
+       {
+               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x16",
+               argLen:  3,
+               generic: true,
+       },
        {
                name:    "SubUint32x16",
                argLen:  2,
                generic: true,
        },
+       {
+               name:    "UnsignedSignedQuadDotProdAccumulateUint32x16",
+               argLen:  3,
+               generic: true,
+       },
        {
                name:        "XorUint32x16",
                argLen:      2,
@@ -59619,11 +60215,21 @@ var opcodeTable = [...]opInfo{
                argLen:  2,
                generic: true,
        },
+       {
+               name:    "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4",
+               argLen:  4,
+               generic: true,
+       },
        {
                name:    "MaskedSubUint32x4",
                argLen:  3,
                generic: true,
        },
+       {
+               name:    "MaskedUnsignedSignedQuadDotProdAccumulateUint32x4",
+               argLen:  4,
+               generic: true,
+       },
        {
                name:        "MaskedXorUint32x4",
                argLen:      3,
@@ -59675,11 +60281,21 @@ var opcodeTable = [...]opInfo{
                argLen:  1,
                generic: true,
        },
+       {
+               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x4",
+               argLen:  3,
+               generic: true,
+       },
        {
                name:    "SubUint32x4",
                argLen:  2,
                generic: true,
        },
+       {
+               name:    "UnsignedSignedQuadDotProdAccumulateUint32x4",
+               argLen:  3,
+               generic: true,
+       },
        {
                name:        "XorUint32x4",
                argLen:      2,
@@ -59803,11 +60419,21 @@ var opcodeTable = [...]opInfo{
                argLen:  2,
                generic: true,
        },
+       {
+               name:    "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8",
+               argLen:  4,
+               generic: true,
+       },
        {
                name:    "MaskedSubUint32x8",
                argLen:  3,
                generic: true,
        },
+       {
+               name:    "MaskedUnsignedSignedQuadDotProdAccumulateUint32x8",
+               argLen:  4,
+               generic: true,
+       },
        {
                name:        "MaskedXorUint32x8",
                argLen:      3,
@@ -59859,11 +60485,21 @@ var opcodeTable = [...]opInfo{
                argLen:  1,
                generic: true,
        },
+       {
+               name:    "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x8",
+               argLen:  3,
+               generic: true,
+       },
        {
                name:    "SubUint32x8",
                argLen:  2,
                generic: true,
        },
+       {
+               name:    "UnsignedSignedQuadDotProdAccumulateUint32x8",
+               argLen:  3,
+               generic: true,
+       },
        {
                name:        "XorUint32x8",
                argLen:      2,
index 3605e75213c542f39934a3dad88b238889968f4f..60469f49d944da177d426c5005a97fbfe7d47d94 100644 (file)
@@ -2696,6 +2696,12 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpMaskedOrUint64x4(v)
        case OpMaskedOrUint64x8:
                return rewriteValueAMD64_OpMaskedOrUint64x8(v)
+       case OpMaskedPairDotProdAccumulateInt32x16:
+               return rewriteValueAMD64_OpMaskedPairDotProdAccumulateInt32x16(v)
+       case OpMaskedPairDotProdAccumulateInt32x4:
+               return rewriteValueAMD64_OpMaskedPairDotProdAccumulateInt32x4(v)
+       case OpMaskedPairDotProdAccumulateInt32x8:
+               return rewriteValueAMD64_OpMaskedPairDotProdAccumulateInt32x8(v)
        case OpMaskedPairDotProdInt16x16:
                return rewriteValueAMD64_OpMaskedPairDotProdInt16x16(v)
        case OpMaskedPairDotProdInt16x32:
@@ -2798,6 +2804,12 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpMaskedSaturatedAddUint8x32(v)
        case OpMaskedSaturatedAddUint8x64:
                return rewriteValueAMD64_OpMaskedSaturatedAddUint8x64(v)
+       case OpMaskedSaturatedPairDotProdAccumulateInt32x16:
+               return rewriteValueAMD64_OpMaskedSaturatedPairDotProdAccumulateInt32x16(v)
+       case OpMaskedSaturatedPairDotProdAccumulateInt32x4:
+               return rewriteValueAMD64_OpMaskedSaturatedPairDotProdAccumulateInt32x4(v)
+       case OpMaskedSaturatedPairDotProdAccumulateInt32x8:
+               return rewriteValueAMD64_OpMaskedSaturatedPairDotProdAccumulateInt32x8(v)
        case OpMaskedSaturatedSubInt16x16:
                return rewriteValueAMD64_OpMaskedSaturatedSubInt16x16(v)
        case OpMaskedSaturatedSubInt16x32:
@@ -2828,6 +2840,18 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedPairDotProdUint16x32(v)
        case OpMaskedSaturatedUnsignedSignedPairDotProdUint16x8:
                return rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedPairDotProdUint16x8(v)
+       case OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16:
+               return rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16(v)
+       case OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4:
+               return rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4(v)
+       case OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8:
+               return rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8(v)
+       case OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16:
+               return rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16(v)
+       case OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4:
+               return rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4(v)
+       case OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8:
+               return rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8(v)
        case OpMaskedSqrtFloat32x16:
                return rewriteValueAMD64_OpMaskedSqrtFloat32x16(v)
        case OpMaskedSqrtFloat32x4:
@@ -2924,6 +2948,18 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat64x4(v)
        case OpMaskedTruncWithPrecisionFloat64x8:
                return rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat64x8(v)
+       case OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x16:
+               return rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x16(v)
+       case OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x4:
+               return rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x4(v)
+       case OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x8:
+               return rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x8(v)
+       case OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x16:
+               return rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x16(v)
+       case OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x4:
+               return rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x4(v)
+       case OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x8:
+               return rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x8(v)
        case OpMaskedXorFloat32x16:
                return rewriteValueAMD64_OpMaskedXorFloat32x16(v)
        case OpMaskedXorFloat32x4:
@@ -3490,6 +3526,15 @@ func rewriteValueAMD64(v *Value) bool {
        case OpOrUint8x32:
                v.Op = OpAMD64VPOR256
                return true
+       case OpPairDotProdAccumulateInt32x16:
+               v.Op = OpAMD64VPDPWSSD512
+               return true
+       case OpPairDotProdAccumulateInt32x4:
+               v.Op = OpAMD64VPDPWSSD128
+               return true
+       case OpPairDotProdAccumulateInt32x8:
+               v.Op = OpAMD64VPDPWSSD256
+               return true
        case OpPairDotProdInt16x16:
                v.Op = OpAMD64VPMADDWD256
                return true
@@ -3813,6 +3858,15 @@ func rewriteValueAMD64(v *Value) bool {
        case OpSaturatedAddUint8x64:
                v.Op = OpAMD64VPADDSB512
                return true
+       case OpSaturatedPairDotProdAccumulateInt32x16:
+               v.Op = OpAMD64VPDPWSSDS512
+               return true
+       case OpSaturatedPairDotProdAccumulateInt32x4:
+               v.Op = OpAMD64VPDPWSSDS128
+               return true
+       case OpSaturatedPairDotProdAccumulateInt32x8:
+               v.Op = OpAMD64VPDPWSSDS256
+               return true
        case OpSaturatedPairwiseAddInt16x16:
                v.Op = OpAMD64VPHADDSW256
                return true
@@ -3876,6 +3930,24 @@ func rewriteValueAMD64(v *Value) bool {
        case OpSaturatedUnsignedSignedPairDotProdUint8x32:
                v.Op = OpAMD64VPMADDUBSW256
                return true
+       case OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16:
+               v.Op = OpAMD64VPDPBUSDS512
+               return true
+       case OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4:
+               v.Op = OpAMD64VPDPBUSDS128
+               return true
+       case OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8:
+               v.Op = OpAMD64VPDPBUSDS256
+               return true
+       case OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16:
+               v.Op = OpAMD64VPDPBUSDS512
+               return true
+       case OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4:
+               v.Op = OpAMD64VPDPBUSDS128
+               return true
+       case OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8:
+               v.Op = OpAMD64VPDPBUSDS256
+               return true
        case OpSelect0:
                return rewriteValueAMD64_OpSelect0(v)
        case OpSelect1:
@@ -4119,6 +4191,24 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpTruncWithPrecisionFloat64x4(v)
        case OpTruncWithPrecisionFloat64x8:
                return rewriteValueAMD64_OpTruncWithPrecisionFloat64x8(v)
+       case OpUnsignedSignedQuadDotProdAccumulateInt32x16:
+               v.Op = OpAMD64VPDPBUSD512
+               return true
+       case OpUnsignedSignedQuadDotProdAccumulateInt32x4:
+               v.Op = OpAMD64VPDPBUSD128
+               return true
+       case OpUnsignedSignedQuadDotProdAccumulateInt32x8:
+               v.Op = OpAMD64VPDPBUSD256
+               return true
+       case OpUnsignedSignedQuadDotProdAccumulateUint32x16:
+               v.Op = OpAMD64VPDPBUSD512
+               return true
+       case OpUnsignedSignedQuadDotProdAccumulateUint32x4:
+               v.Op = OpAMD64VPDPBUSD128
+               return true
+       case OpUnsignedSignedQuadDotProdAccumulateUint32x8:
+               v.Op = OpAMD64VPDPBUSD256
+               return true
        case OpWB:
                v.Op = OpAMD64LoweredWB
                return true
@@ -42772,6 +42862,66 @@ func rewriteValueAMD64_OpMaskedOrUint64x8(v *Value) bool {
                return true
        }
 }
+func rewriteValueAMD64_OpMaskedPairDotProdAccumulateInt32x16(v *Value) bool { // lowers generic MaskedPairDotProdAccumulateInt32x16 to VPDPWSSDMasked512
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedPairDotProdAccumulateInt32x16 x y z mask)
+       // result: (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPWSSDMasked512) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
+func rewriteValueAMD64_OpMaskedPairDotProdAccumulateInt32x4(v *Value) bool { // lowers generic MaskedPairDotProdAccumulateInt32x4 to VPDPWSSDMasked128
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedPairDotProdAccumulateInt32x4 x y z mask)
+       // result: (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPWSSDMasked128) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
+func rewriteValueAMD64_OpMaskedPairDotProdAccumulateInt32x8(v *Value) bool { // lowers generic MaskedPairDotProdAccumulateInt32x8 to VPDPWSSDMasked256
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedPairDotProdAccumulateInt32x8 x y z mask)
+       // result: (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPWSSDMasked256) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
 func rewriteValueAMD64_OpMaskedPairDotProdInt16x16(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
@@ -43642,6 +43792,66 @@ func rewriteValueAMD64_OpMaskedSaturatedAddUint8x64(v *Value) bool {
                return true
        }
 }
+func rewriteValueAMD64_OpMaskedSaturatedPairDotProdAccumulateInt32x16(v *Value) bool { // lowers generic MaskedSaturatedPairDotProdAccumulateInt32x16 to VPDPWSSDSMasked512
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedSaturatedPairDotProdAccumulateInt32x16 x y z mask)
+       // result: (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPWSSDSMasked512) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
+func rewriteValueAMD64_OpMaskedSaturatedPairDotProdAccumulateInt32x4(v *Value) bool { // lowers generic MaskedSaturatedPairDotProdAccumulateInt32x4 to VPDPWSSDSMasked128
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedSaturatedPairDotProdAccumulateInt32x4 x y z mask)
+       // result: (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPWSSDSMasked128) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
+func rewriteValueAMD64_OpMaskedSaturatedPairDotProdAccumulateInt32x8(v *Value) bool { // lowers generic MaskedSaturatedPairDotProdAccumulateInt32x8 to VPDPWSSDSMasked256
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedSaturatedPairDotProdAccumulateInt32x8 x y z mask)
+       // result: (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPWSSDSMasked256) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
 func rewriteValueAMD64_OpMaskedSaturatedSubInt16x16(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
@@ -43912,6 +44122,126 @@ func rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedPairDotProdUint16x8(v *Val
                return true
        }
 }
+func rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16(v *Value) bool { // lowers generic MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16 to VPDPBUSDSMasked512
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16 x y z mask)
+       // result: (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPBUSDSMasked512) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
+func rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4(v *Value) bool { // lowers generic MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4 to VPDPBUSDSMasked128
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4 x y z mask)
+       // result: (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPBUSDSMasked128) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
+func rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8(v *Value) bool { // lowers generic MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8 to VPDPBUSDSMasked256
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8 x y z mask)
+       // result: (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPBUSDSMasked256) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
+func rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16(v *Value) bool { // lowers the Uint32x16 op to VPDPBUSDSMasked512, the same target op the Int32x16 rule uses
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16 x y z mask)
+       // result: (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPBUSDSMasked512) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
+func rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4(v *Value) bool { // lowers the Uint32x4 op to VPDPBUSDSMasked128, the same target op the Int32x4 rule uses
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4 x y z mask)
+       // result: (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPBUSDSMasked128) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
+func rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8(v *Value) bool { // lowers the Uint32x8 op to VPDPBUSDSMasked256, the same target op the Int32x8 rule uses
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8 x y z mask)
+       // result: (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPBUSDSMasked256) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
 func rewriteValueAMD64_OpMaskedSqrtFloat32x16(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
@@ -44764,6 +45094,126 @@ func rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat64x8(v *Value) bool {
                return true
        }
 }
+func rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x16(v *Value) bool { // lowers generic MaskedUnsignedSignedQuadDotProdAccumulateInt32x16 to VPDPBUSDMasked512
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedUnsignedSignedQuadDotProdAccumulateInt32x16 x y z mask)
+       // result: (VPDPBUSDMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPBUSDMasked512) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
+func rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x4(v *Value) bool { // lowers generic MaskedUnsignedSignedQuadDotProdAccumulateInt32x4 to VPDPBUSDMasked128
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedUnsignedSignedQuadDotProdAccumulateInt32x4 x y z mask)
+       // result: (VPDPBUSDMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPBUSDMasked128) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
+func rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x8(v *Value) bool { // lowers generic MaskedUnsignedSignedQuadDotProdAccumulateInt32x8 to VPDPBUSDMasked256
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedUnsignedSignedQuadDotProdAccumulateInt32x8 x y z mask)
+       // result: (VPDPBUSDMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPBUSDMasked256) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
+func rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x16(v *Value) bool { // lowers the Uint32x16 op to VPDPBUSDMasked512, the same target op the Int32x16 rule uses
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedUnsignedSignedQuadDotProdAccumulateUint32x16 x y z mask)
+       // result: (VPDPBUSDMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPBUSDMasked512) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
+func rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x4(v *Value) bool { // lowers the Uint32x4 op to VPDPBUSDMasked128, the same target op the Int32x4 rule uses
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedUnsignedSignedQuadDotProdAccumulateUint32x4 x y z mask)
+       // result: (VPDPBUSDMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPBUSDMasked128) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
+func rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x8(v *Value) bool { // lowers the Uint32x8 op to VPDPBUSDMasked256, the same target op the Int32x8 rule uses
+       v_3 := v.Args[3] // bound to mask below
+       v_2 := v.Args[2] // bound to z below
+       v_1 := v.Args[1] // bound to y below
+       v_0 := v.Args[0] // bound to x below
+       b := v.Block // block in which the mask-conversion value is created
+       // match: (MaskedUnsignedSignedQuadDotProdAccumulateUint32x8 x y z mask)
+       // result: (VPDPBUSDMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for { // no match condition: the first pass always rewrites and returns
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPDPBUSDMasked256) // retarget v in place; presumably drops its old args (confirm in Value.reset)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) // new value at v's position, typed types.TypeMask
+               v0.AddArg(mask) // the original mask operand feeds the conversion
+               v.AddArg4(x, y, z, v0) // x, y, z keep their order; converted mask goes last
+               return true // unconditional, so this function never returns false
+       }
+}
 func rewriteValueAMD64_OpMaskedXorFloat32x16(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
index 7ac5f74246edc89d72a7164a07660ff9748b1fbb..b7b80a706311ea9b30610c05a5611ed6872fe4bc 100644 (file)
@@ -833,6 +833,10 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Int32x16.MaskedOr", opLen3(ssa.OpMaskedOrInt32x16, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Int32x16.MaskedSub", opLen3(ssa.OpMaskedSubInt32x16, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Int32x16.MaskedXor", opLen3(ssa.OpMaskedXorInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x16.PairDotProdAccumulate", opLen3(ssa.OpPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x16.SaturatedPairDotProdAccumulate", opLen3(ssa.OpSaturatedPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x16.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x16.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Int32x4.MaskedAdd", opLen3(ssa.OpMaskedAddInt32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Int32x4.MaskedAnd", opLen3(ssa.OpMaskedAndInt32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Int32x4.MaskedAndNot", opLen3(ssa.OpMaskedAndNotInt32x4, types.TypeVec128), sys.AMD64)
@@ -848,6 +852,10 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Int32x4.MaskedOr", opLen3(ssa.OpMaskedOrInt32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Int32x4.MaskedSub", opLen3(ssa.OpMaskedSubInt32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Int32x4.MaskedXor", opLen3(ssa.OpMaskedXorInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x4.PairDotProdAccumulate", opLen3(ssa.OpPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x4.SaturatedPairDotProdAccumulate", opLen3(ssa.OpSaturatedPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x4.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x4.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Int32x8.MaskedAdd", opLen3(ssa.OpMaskedAddInt32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Int32x8.MaskedAnd", opLen3(ssa.OpMaskedAndInt32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Int32x8.MaskedAndNot", opLen3(ssa.OpMaskedAndNotInt32x8, types.TypeVec256), sys.AMD64)
@@ -863,6 +871,10 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Int32x8.MaskedOr", opLen3(ssa.OpMaskedOrInt32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Int32x8.MaskedSub", opLen3(ssa.OpMaskedSubInt32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Int32x8.MaskedXor", opLen3(ssa.OpMaskedXorInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x8.PairDotProdAccumulate", opLen3(ssa.OpPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x8.SaturatedPairDotProdAccumulate", opLen3(ssa.OpSaturatedPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x8.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x8.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Int64x2.MaskedAdd", opLen3(ssa.OpMaskedAddInt64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Int64x2.MaskedAnd", opLen3(ssa.OpMaskedAndInt64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Int64x2.MaskedAndNot", opLen3(ssa.OpMaskedAndNotInt64x2, types.TypeVec128), sys.AMD64)
@@ -1006,6 +1018,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Uint32x16.MaskedOr", opLen3(ssa.OpMaskedOrUint32x16, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Uint32x16.MaskedSub", opLen3(ssa.OpMaskedSubUint32x16, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Uint32x16.MaskedXor", opLen3(ssa.OpMaskedXorUint32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint32x16.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint32x16.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateUint32x16, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Uint32x4.MaskedAdd", opLen3(ssa.OpMaskedAddUint32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint32x4.MaskedAnd", opLen3(ssa.OpMaskedAndUint32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint32x4.MaskedAndNot", opLen3(ssa.OpMaskedAndNotUint32x4, types.TypeVec128), sys.AMD64)
@@ -1020,6 +1034,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Uint32x4.MaskedOr", opLen3(ssa.OpMaskedOrUint32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint32x4.MaskedSub", opLen3(ssa.OpMaskedSubUint32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint32x4.MaskedXor", opLen3(ssa.OpMaskedXorUint32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint32x4.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint32x4.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateUint32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint32x8.MaskedAdd", opLen3(ssa.OpMaskedAddUint32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Uint32x8.MaskedAnd", opLen3(ssa.OpMaskedAndUint32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Uint32x8.MaskedAndNot", opLen3(ssa.OpMaskedAndNotUint32x8, types.TypeVec256), sys.AMD64)
@@ -1034,6 +1050,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Uint32x8.MaskedOr", opLen3(ssa.OpMaskedOrUint32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Uint32x8.MaskedSub", opLen3(ssa.OpMaskedSubUint32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Uint32x8.MaskedXor", opLen3(ssa.OpMaskedXorUint32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint32x8.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint32x8.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateUint32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Uint64x2.MaskedAdd", opLen3(ssa.OpMaskedAddUint64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint64x2.MaskedAnd", opLen3(ssa.OpMaskedAndUint64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint64x2.MaskedAndNot", opLen3(ssa.OpMaskedAndNotUint64x2, types.TypeVec128), sys.AMD64)
@@ -1118,6 +1136,24 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Uint8x64.MaskedSaturatedAdd", opLen3(ssa.OpMaskedSaturatedAddUint8x64, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Uint8x64.MaskedSaturatedSub", opLen3(ssa.OpMaskedSaturatedSubUint8x64, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Uint8x64.MaskedSub", opLen3(ssa.OpMaskedSubUint8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x16.MaskedPairDotProdAccumulate", opLen4(ssa.OpMaskedPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x16.MaskedSaturatedPairDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x16.MaskedSaturatedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x16.MaskedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x4.MaskedPairDotProdAccumulate", opLen4(ssa.OpMaskedPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x4.MaskedSaturatedPairDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x4.MaskedSaturatedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x4.MaskedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x8.MaskedPairDotProdAccumulate", opLen4(ssa.OpMaskedPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x8.MaskedSaturatedPairDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x8.MaskedSaturatedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x8.MaskedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint32x16.MaskedSaturatedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint32x16.MaskedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint32x4.MaskedSaturatedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint32x4.MaskedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint32x8.MaskedSaturatedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint32x8.MaskedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float32x16.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
        addF(simdPackage, "Float32x4.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
        addF(simdPackage, "Float32x8.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
index 83edaf2270025e62f5db8495efb01d78d00771f9..49af32bc4fca616b25b3f09735a4af00d863f280 100644 (file)
@@ -766,6 +766,7 @@ func (x Float64x2) AndNot(y Float64x2) Float64x2
 func (x Float64x2) Div(y Float64x2) Float64x2
 
 // DotProdBroadcast multiplies all elements and broadcasts the sum.
+// Const Immediate = 127.
 //
 // Asm: VDPPD, CPU Feature: AVX
 func (x Float64x2) DotProdBroadcast(y Float64x2) Float64x2
@@ -4437,6 +4438,26 @@ func (x Int32x16) MaskedSub(y Int32x16, z Mask32x16) Int32x16
 // Asm: VPXORD, CPU Feature: AVX512EVEX
 func (x Int32x16) MaskedXor(y Int32x16, z Mask32x16) Int32x16
 
+// PairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPWSSD, CPU Feature: AVX512EVEX
+func (x Int32x16) PairDotProdAccumulate(y Int16x32, z Int32x16) Int32x16
+
+// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPWSSDS, CPU Feature: AVX512EVEX
+func (x Int32x16) SaturatedPairDotProdAccumulate(y Int16x32, z Int32x16) Int32x16
+
+// SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX512EVEX
+func (x Int32x16) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16) Int32x16
+
+// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPBUSD, CPU Feature: AVX512EVEX
+func (x Int32x16) UnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16) Int32x16
+
 // Add adds corresponding elements of two vectors.
 //
 // Asm: VPADDD, CPU Feature: AVX512EVEX
@@ -4518,6 +4539,26 @@ func (x Int32x4) MaskedSub(y Int32x4, z Mask32x4) Int32x4
 // Asm: VPXORD, CPU Feature: AVX512EVEX
 func (x Int32x4) MaskedXor(y Int32x4, z Mask32x4) Int32x4
 
+// PairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPWSSD, CPU Feature: AVX_VNNI
+func (x Int32x4) PairDotProdAccumulate(y Int32x4, z Int32x4) Int32x4
+
+// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPWSSDS, CPU Feature: AVX_VNNI
+func (x Int32x4) SaturatedPairDotProdAccumulate(y Int32x4, z Int32x4) Int32x4
+
+// SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX_VNNI
+func (x Int32x4) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint32x4, z Int32x4) Int32x4
+
+// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPBUSD, CPU Feature: AVX_VNNI
+func (x Int32x4) UnsignedSignedQuadDotProdAccumulate(y Uint32x4, z Int32x4) Int32x4
+
 // Add adds corresponding elements of two vectors.
 //
 // Asm: VPADDD, CPU Feature: AVX512EVEX
@@ -4599,6 +4640,26 @@ func (x Int32x8) MaskedSub(y Int32x8, z Mask32x8) Int32x8
 // Asm: VPXORD, CPU Feature: AVX512EVEX
 func (x Int32x8) MaskedXor(y Int32x8, z Mask32x8) Int32x8
 
+// PairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPWSSD, CPU Feature: AVX_VNNI
+func (x Int32x8) PairDotProdAccumulate(y Int32x8, z Int32x8) Int32x8
+
+// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPWSSDS, CPU Feature: AVX_VNNI
+func (x Int32x8) SaturatedPairDotProdAccumulate(y Int32x8, z Int32x8) Int32x8
+
+// SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX_VNNI
+func (x Int32x8) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint32x8, z Int32x8) Int32x8
+
+// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPBUSD, CPU Feature: AVX_VNNI
+func (x Int32x8) UnsignedSignedQuadDotProdAccumulate(y Uint32x8, z Int32x8) Int32x8
+
 // Add adds corresponding elements of two vectors.
 //
 // Asm: VPADDQ, CPU Feature: AVX512EVEX
@@ -5380,6 +5441,16 @@ func (x Uint32x16) MaskedSub(y Uint32x16, z Mask32x16) Uint32x16
 // Asm: VPXORD, CPU Feature: AVX512EVEX
 func (x Uint32x16) MaskedXor(y Uint32x16, z Mask32x16) Uint32x16
 
+// SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX512EVEX
+func (x Uint32x16) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16) Uint32x16
+
+// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPBUSD, CPU Feature: AVX512EVEX
+func (x Uint32x16) UnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16) Uint32x16
+
 // Add adds corresponding elements of two vectors.
 //
 // Asm: VPADDD, CPU Feature: AVX512EVEX
@@ -5456,6 +5527,16 @@ func (x Uint32x4) MaskedSub(y Uint32x4, z Mask32x4) Uint32x4
 // Asm: VPXORD, CPU Feature: AVX512EVEX
 func (x Uint32x4) MaskedXor(y Uint32x4, z Mask32x4) Uint32x4
 
+// SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX_VNNI
+func (x Uint32x4) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint32x4, z Int32x4) Uint32x4
+
+// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPBUSD, CPU Feature: AVX_VNNI
+func (x Uint32x4) UnsignedSignedQuadDotProdAccumulate(y Uint32x4, z Int32x4) Uint32x4
+
 // Add adds corresponding elements of two vectors.
 //
 // Asm: VPADDD, CPU Feature: AVX512EVEX
@@ -5532,6 +5613,16 @@ func (x Uint32x8) MaskedSub(y Uint32x8, z Mask32x8) Uint32x8
 // Asm: VPXORD, CPU Feature: AVX512EVEX
 func (x Uint32x8) MaskedXor(y Uint32x8, z Mask32x8) Uint32x8
 
+// SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX_VNNI
+func (x Uint32x8) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint32x8, z Int32x8) Uint32x8
+
+// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPBUSD, CPU Feature: AVX_VNNI
+func (x Uint32x8) UnsignedSignedQuadDotProdAccumulate(y Uint32x8, z Int32x8) Uint32x8
+
 // Add adds corresponding elements of two vectors.
 //
 // Asm: VPADDQ, CPU Feature: AVX512EVEX
@@ -5991,6 +6082,96 @@ func (x Uint8x64) MaskedSaturatedSub(y Uint8x64, z Mask8x64) Uint8x64
 // Asm: VPSUBB, CPU Feature: AVX512EVEX
 func (x Uint8x64) MaskedSub(y Uint8x64, z Mask8x64) Uint8x64
 
+// MaskedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPWSSD, CPU Feature: AVX512EVEX
+func (x Int32x16) MaskedPairDotProdAccumulate(y Int16x32, z Int32x16, u Mask32x16) Int32x16
+
+// MaskedSaturatedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPWSSDS, CPU Feature: AVX512EVEX
+func (x Int32x16) MaskedSaturatedPairDotProdAccumulate(y Int16x32, z Int32x16, u Mask32x16) Int32x16
+
+// MaskedSaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX512EVEX
+func (x Int32x16) MaskedSaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16, u Mask32x16) Int32x16
+
+// MaskedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPBUSD, CPU Feature: AVX512EVEX
+func (x Int32x16) MaskedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16, u Mask32x16) Int32x16
+
+// MaskedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPWSSD, CPU Feature: AVX512EVEX
+func (x Int32x4) MaskedPairDotProdAccumulate(y Int16x8, z Int32x4, u Mask32x4) Int32x4
+
+// MaskedSaturatedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPWSSDS, CPU Feature: AVX512EVEX
+func (x Int32x4) MaskedSaturatedPairDotProdAccumulate(y Int16x8, z Int32x4, u Mask32x4) Int32x4
+
+// MaskedSaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX512EVEX
+func (x Int32x4) MaskedSaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int32x4, u Mask32x4) Int32x4
+
+// MaskedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPBUSD, CPU Feature: AVX512EVEX
+func (x Int32x4) MaskedUnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int32x4, u Mask32x4) Int32x4
+
+// MaskedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPWSSD, CPU Feature: AVX512EVEX
+func (x Int32x8) MaskedPairDotProdAccumulate(y Int16x16, z Int32x8, u Mask32x8) Int32x8
+
+// MaskedSaturatedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPWSSDS, CPU Feature: AVX512EVEX
+func (x Int32x8) MaskedSaturatedPairDotProdAccumulate(y Int16x16, z Int32x8, u Mask32x8) Int32x8
+
+// MaskedSaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX512EVEX
+func (x Int32x8) MaskedSaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int32x8, u Mask32x8) Int32x8
+
+// MaskedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPBUSD, CPU Feature: AVX512EVEX
+func (x Int32x8) MaskedUnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int32x8, u Mask32x8) Int32x8
+
+// MaskedSaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX512EVEX
+func (x Uint32x16) MaskedSaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16, u Mask32x16) Uint32x16
+
+// MaskedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPBUSD, CPU Feature: AVX512EVEX
+func (x Uint32x16) MaskedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16, u Mask32x16) Uint32x16
+
+// MaskedSaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX512EVEX
+func (x Uint32x4) MaskedSaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int32x4, u Mask32x4) Uint32x4
+
+// MaskedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPBUSD, CPU Feature: AVX512EVEX
+func (x Uint32x4) MaskedUnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int32x4, u Mask32x4) Uint32x4
+
+// MaskedSaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX512EVEX
+func (x Uint32x8) MaskedSaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int32x8, u Mask32x8) Uint32x8
+
+// MaskedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x.
+//
+// Asm: VPDPBUSD, CPU Feature: AVX512EVEX
+func (x Uint32x8) MaskedUnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int32x8, u Mask32x8) Uint32x8
+
 // CeilSuppressExceptionWithPrecision rounds elements up with specified precision, suppressing exceptions.
 // Const Immediate = 10.
 //