From: Junyang Shao Date: Thu, 12 Jun 2025 16:43:10 +0000 (+0000) Subject: [dev.simd] cmd/compile: add more dot products X-Git-Tag: go1.26rc1~147^2~239 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=ded6e0ac7140403480fa4539ed42ae8577eefbf9;p=gostls13.git [dev.simd] cmd/compile: add more dot products This CL is generated by CL 680215. Change-Id: Ie085e65e0473a8e96170702d7265d379ec8812ba Reviewed-on: https://go-review.googlesource.com/c/go/+/681298 Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI --- diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index 02353c7f7b..7e9abbd3cb 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -679,6 +679,34 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPCMPBMasked512: p = simdFp2k1k1Imm8(s, v) + case ssa.OpAMD64VPDPWSSD128, + ssa.OpAMD64VPDPWSSD256, + ssa.OpAMD64VPDPWSSD512, + ssa.OpAMD64VPDPWSSDS128, + ssa.OpAMD64VPDPWSSDS256, + ssa.OpAMD64VPDPWSSDS512, + ssa.OpAMD64VPDPBUSDS128, + ssa.OpAMD64VPDPBUSDS256, + ssa.OpAMD64VPDPBUSDS512, + ssa.OpAMD64VPDPBUSD128, + ssa.OpAMD64VPDPBUSD256, + ssa.OpAMD64VPDPBUSD512: + p = simdFp31ResultInArg0(s, v) + + case ssa.OpAMD64VPDPWSSDMasked512, + ssa.OpAMD64VPDPWSSDMasked128, + ssa.OpAMD64VPDPWSSDMasked256, + ssa.OpAMD64VPDPWSSDSMasked512, + ssa.OpAMD64VPDPWSSDSMasked128, + ssa.OpAMD64VPDPWSSDSMasked256, + ssa.OpAMD64VPDPBUSDSMasked512, + ssa.OpAMD64VPDPBUSDSMasked128, + ssa.OpAMD64VPDPBUSDSMasked256, + ssa.OpAMD64VPDPBUSDMasked512, + ssa.OpAMD64VPDPBUSDMasked128, + ssa.OpAMD64VPDPBUSDMasked256: + p = simdFp3k1fp1ResultInArg0(s, v) + default: // Unknown reg shape return false @@ -884,6 +912,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMADDWDMasked256, ssa.OpAMD64VPMADDWDMasked512, ssa.OpAMD64VPMADDWDMasked128, + ssa.OpAMD64VPDPWSSDMasked512, + ssa.OpAMD64VPDPWSSDMasked128, + ssa.OpAMD64VPDPWSSDMasked256, 
ssa.OpAMD64VPOPCNTWMasked256, ssa.OpAMD64VPOPCNTWMasked512, ssa.OpAMD64VPOPCNTWMasked128, @@ -902,6 +933,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPADDSBMasked128, ssa.OpAMD64VPADDSBMasked256, ssa.OpAMD64VPADDSBMasked512, + ssa.OpAMD64VPDPWSSDSMasked512, + ssa.OpAMD64VPDPWSSDSMasked128, + ssa.OpAMD64VPDPWSSDSMasked256, ssa.OpAMD64VPSUBSWMasked256, ssa.OpAMD64VPSUBSWMasked512, ssa.OpAMD64VPSUBSWMasked128, @@ -911,6 +945,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMADDUBSWMasked256, ssa.OpAMD64VPMADDUBSWMasked512, ssa.OpAMD64VPMADDUBSWMasked128, + ssa.OpAMD64VPDPBUSDSMasked512, + ssa.OpAMD64VPDPBUSDSMasked128, + ssa.OpAMD64VPDPBUSDSMasked256, ssa.OpAMD64VSQRTPSMasked512, ssa.OpAMD64VSQRTPSMasked128, ssa.OpAMD64VSQRTPSMasked256, @@ -929,6 +966,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPSUBBMasked128, ssa.OpAMD64VPSUBBMasked256, ssa.OpAMD64VPSUBBMasked512, + ssa.OpAMD64VPDPBUSDMasked512, + ssa.OpAMD64VPDPBUSDMasked128, + ssa.OpAMD64VPDPBUSDMasked256, ssa.OpAMD64VXORPSMasked512, ssa.OpAMD64VXORPSMasked128, ssa.OpAMD64VXORPSMasked256, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index d5caf09dac..efee484b99 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -833,6 +833,9 @@ (MaskedPairDotProdInt16x16 x y mask) => (VPMADDWDMasked256 x y (VPMOVVec16x16ToM mask)) (MaskedPairDotProdInt16x32 x y mask) => (VPMADDWDMasked512 x y (VPMOVVec16x32ToM mask)) (MaskedPairDotProdInt16x8 x y mask) => (VPMADDWDMasked128 x y (VPMOVVec16x8ToM mask)) +(MaskedPairDotProdAccumulateInt32x16 x y z mask) => (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM mask)) +(MaskedPairDotProdAccumulateInt32x4 x y z mask) => (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM mask)) +(MaskedPairDotProdAccumulateInt32x8 x y z mask) => (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM mask)) 
(MaskedPopCountInt16x16 x mask) => (VPOPCNTWMasked256 x (VPMOVVec16x16ToM mask)) (MaskedPopCountInt16x32 x mask) => (VPOPCNTWMasked512 x (VPMOVVec16x32ToM mask)) (MaskedPopCountInt16x8 x mask) => (VPOPCNTWMasked128 x (VPMOVVec16x8ToM mask)) @@ -881,6 +884,9 @@ (MaskedSaturatedAddUint8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask)) (MaskedSaturatedAddUint8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask)) (MaskedSaturatedAddUint8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask)) +(MaskedSaturatedPairDotProdAccumulateInt32x16 x y z mask) => (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM mask)) +(MaskedSaturatedPairDotProdAccumulateInt32x4 x y z mask) => (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM mask)) +(MaskedSaturatedPairDotProdAccumulateInt32x8 x y z mask) => (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM mask)) (MaskedSaturatedSubInt16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask)) (MaskedSaturatedSubInt16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask)) (MaskedSaturatedSubInt16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask)) @@ -896,6 +902,12 @@ (MaskedSaturatedUnsignedSignedPairDotProdUint16x16 x y mask) => (VPMADDUBSWMasked256 x y (VPMOVVec16x16ToM mask)) (MaskedSaturatedUnsignedSignedPairDotProdUint16x32 x y mask) => (VPMADDUBSWMasked512 x y (VPMOVVec16x32ToM mask)) (MaskedSaturatedUnsignedSignedPairDotProdUint16x8 x y mask) => (VPMADDUBSWMasked128 x y (VPMOVVec16x8ToM mask)) +(MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16 x y z mask) => (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM mask)) +(MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4 x y z mask) => (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM mask)) +(MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8 x y z mask) => (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM mask)) +(MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16 x y z mask) => (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM mask)) 
+(MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4 x y z mask) => (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM mask)) +(MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8 x y z mask) => (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM mask)) (MaskedSqrtFloat32x16 x mask) => (VSQRTPSMasked512 x (VPMOVVec32x16ToM mask)) (MaskedSqrtFloat32x4 x mask) => (VSQRTPSMasked128 x (VPMOVVec32x4ToM mask)) (MaskedSqrtFloat32x8 x mask) => (VSQRTPSMasked256 x (VPMOVVec32x8ToM mask)) @@ -944,6 +956,12 @@ (MaskedTruncWithPrecisionFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask)) (MaskedTruncWithPrecisionFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask)) (MaskedTruncWithPrecisionFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask)) +(MaskedUnsignedSignedQuadDotProdAccumulateInt32x16 x y z mask) => (VPDPBUSDMasked512 x y z (VPMOVVec32x16ToM mask)) +(MaskedUnsignedSignedQuadDotProdAccumulateInt32x4 x y z mask) => (VPDPBUSDMasked128 x y z (VPMOVVec32x4ToM mask)) +(MaskedUnsignedSignedQuadDotProdAccumulateInt32x8 x y z mask) => (VPDPBUSDMasked256 x y z (VPMOVVec32x8ToM mask)) +(MaskedUnsignedSignedQuadDotProdAccumulateUint32x16 x y z mask) => (VPDPBUSDMasked512 x y z (VPMOVVec32x16ToM mask)) +(MaskedUnsignedSignedQuadDotProdAccumulateUint32x4 x y z mask) => (VPDPBUSDMasked128 x y z (VPMOVVec32x4ToM mask)) +(MaskedUnsignedSignedQuadDotProdAccumulateUint32x8 x y z mask) => (VPDPBUSDMasked256 x y z (VPMOVVec32x8ToM mask)) (MaskedXorFloat32x16 x y mask) => (VXORPSMasked512 x y (VPMOVVec32x16ToM mask)) (MaskedXorFloat32x4 x y mask) => (VXORPSMasked128 x y (VPMOVVec32x4ToM mask)) (MaskedXorFloat32x8 x y mask) => (VXORPSMasked256 x y (VPMOVVec32x8ToM mask)) @@ -1118,6 +1136,9 @@ (PairDotProdInt16x16 ...) => (VPMADDWD256 ...) (PairDotProdInt16x32 ...) => (VPMADDWD512 ...) (PairDotProdInt16x8 ...) => (VPMADDWD128 ...) +(PairDotProdAccumulateInt32x16 ...) => (VPDPWSSD512 ...) 
+(PairDotProdAccumulateInt32x4 ...) => (VPDPWSSD128 ...) +(PairDotProdAccumulateInt32x8 ...) => (VPDPWSSD256 ...) (PairwiseAddFloat32x4 ...) => (VHADDPS128 ...) (PairwiseAddFloat32x8 ...) => (VHADDPS256 ...) (PairwiseAddFloat64x2 ...) => (VHADDPD128 ...) @@ -1194,6 +1215,9 @@ (SaturatedAddUint8x16 ...) => (VPADDSB128 ...) (SaturatedAddUint8x32 ...) => (VPADDSB256 ...) (SaturatedAddUint8x64 ...) => (VPADDSB512 ...) +(SaturatedPairDotProdAccumulateInt32x16 ...) => (VPDPWSSDS512 ...) +(SaturatedPairDotProdAccumulateInt32x4 ...) => (VPDPWSSDS128 ...) +(SaturatedPairDotProdAccumulateInt32x8 ...) => (VPDPWSSDS256 ...) (SaturatedPairwiseAddInt16x16 ...) => (VPHADDSW256 ...) (SaturatedPairwiseAddInt16x8 ...) => (VPHADDSW128 ...) (SaturatedPairwiseSubInt16x16 ...) => (VPHSUBSW256 ...) @@ -1215,6 +1239,12 @@ (SaturatedUnsignedSignedPairDotProdUint16x8 ...) => (VPMADDUBSW128 ...) (SaturatedUnsignedSignedPairDotProdUint8x16 ...) => (VPMADDUBSW128 ...) (SaturatedUnsignedSignedPairDotProdUint8x32 ...) => (VPMADDUBSW256 ...) +(SaturatedUnsignedSignedQuadDotProdAccumulateInt32x16 ...) => (VPDPBUSDS512 ...) +(SaturatedUnsignedSignedQuadDotProdAccumulateInt32x4 ...) => (VPDPBUSDS128 ...) +(SaturatedUnsignedSignedQuadDotProdAccumulateInt32x8 ...) => (VPDPBUSDS256 ...) +(SaturatedUnsignedSignedQuadDotProdAccumulateUint32x16 ...) => (VPDPBUSDS512 ...) +(SaturatedUnsignedSignedQuadDotProdAccumulateUint32x4 ...) => (VPDPBUSDS128 ...) +(SaturatedUnsignedSignedQuadDotProdAccumulateUint32x8 ...) => (VPDPBUSDS256 ...) (SignInt16x16 ...) => (VPSIGNW256 ...) (SignInt16x8 ...) => (VPSIGNW128 ...) (SignInt32x4 ...) => (VPSIGND128 ...) @@ -1273,6 +1303,12 @@ (TruncWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+3] x) (TruncWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+3] x) (TruncWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+3] x) +(UnsignedSignedQuadDotProdAccumulateInt32x16 ...) => (VPDPBUSD512 ...) +(UnsignedSignedQuadDotProdAccumulateInt32x4 ...) => (VPDPBUSD128 ...) 
+(UnsignedSignedQuadDotProdAccumulateInt32x8 ...) => (VPDPBUSD256 ...) +(UnsignedSignedQuadDotProdAccumulateUint32x16 ...) => (VPDPBUSD512 ...) +(UnsignedSignedQuadDotProdAccumulateUint32x4 ...) => (VPDPBUSD128 ...) +(UnsignedSignedQuadDotProdAccumulateUint32x8 ...) => (VPDPBUSD256 ...) (XorFloat32x16 ...) => (VXORPS512 ...) (XorFloat32x4 ...) => (VXORPS128 ...) (XorFloat32x8 ...) => (VXORPS256 ...) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index f580973c9d..6cc405c030 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -283,15 +283,23 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1 {name: "VPMINSDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPMINSD", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMULLDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPMULLD", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPORDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPORD", commutative: true, typ: "Vec512", resultInArg0: false}, + {name: "VPDPWSSDMasked512", argLength: 4, reg: fp3k1fp1, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPOPCNTDMasked512", argLength: 2, reg: fp1k1fp1, asm: "VPOPCNTD", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPDPWSSDSMasked512", argLength: 4, reg: fp3k1fp1, asm: "VPDPWSSDS", commutative: false, typ: "Vec512", resultInArg0: true}, + {name: "VPDPBUSDSMasked512", argLength: 4, reg: fp3k1fp1, asm: "VPDPBUSDS", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPSUBDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPSUBD", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPDPBUSDMasked512", argLength: 4, reg: fp3k1fp1, asm: "VPDPBUSD", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPXORDMasked512", argLength: 3, reg: fp2k1fp1, asm: 
"VPXORD", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSD512", argLength: 2, reg: fp21, asm: "VPMAXSD", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMINSD512", argLength: 2, reg: fp21, asm: "VPMINSD", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMULLD512", argLength: 2, reg: fp21, asm: "VPMULLD", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPORD512", argLength: 2, reg: fp21, asm: "VPORD", commutative: true, typ: "Vec512", resultInArg0: false}, + {name: "VPDPWSSD512", argLength: 3, reg: fp31, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPOPCNTD512", argLength: 1, reg: fp11, asm: "VPOPCNTD", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPDPWSSDS512", argLength: 3, reg: fp31, asm: "VPDPWSSDS", commutative: false, typ: "Vec512", resultInArg0: true}, + {name: "VPDPBUSDS512", argLength: 3, reg: fp31, asm: "VPDPBUSDS", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPSUBD512", argLength: 2, reg: fp21, asm: "VPSUBD", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPDPBUSD512", argLength: 3, reg: fp31, asm: "VPDPBUSD", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPXORD512", argLength: 2, reg: fp21, asm: "VPXORD", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPABSD128", argLength: 1, reg: fp11, asm: "VPABSD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPADDD128", argLength: 2, reg: fp21, asm: "VPADDD", commutative: true, typ: "Vec128", resultInArg0: false}, @@ -307,18 +315,26 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1 {name: "VPMINSDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPMINSD", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMULLDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPMULLD", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPORDMasked128", 
argLength: 3, reg: fp2k1fp1, asm: "VPORD", commutative: true, typ: "Vec128", resultInArg0: false}, + {name: "VPDPWSSDMasked128", argLength: 4, reg: fp3k1fp1, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPOPCNTDMasked128", argLength: 2, reg: fp1k1fp1, asm: "VPOPCNTD", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPDPWSSDSMasked128", argLength: 4, reg: fp3k1fp1, asm: "VPDPWSSDS", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPDPBUSDSMasked128", argLength: 4, reg: fp3k1fp1, asm: "VPDPBUSDS", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPSUBDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPSUBD", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPDPBUSDMasked128", argLength: 4, reg: fp3k1fp1, asm: "VPDPBUSD", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPXORDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPXORD", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMAXSD128", argLength: 2, reg: fp21, asm: "VPMAXSD", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMINSD128", argLength: 2, reg: fp21, asm: "VPMINSD", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMULDQ128", argLength: 2, reg: fp21, asm: "VPMULDQ", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMULLD128", argLength: 2, reg: fp21, asm: "VPMULLD", commutative: true, typ: "Vec128", resultInArg0: false}, + {name: "VPDPWSSD128", argLength: 3, reg: fp31, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPHADDD128", argLength: 2, reg: fp21, asm: "VPHADDD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPHSUBD128", argLength: 2, reg: fp21, asm: "VPHSUBD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPOPCNTD128", argLength: 1, reg: fp11, asm: "VPOPCNTD", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPDPWSSDS128", 
argLength: 3, reg: fp31, asm: "VPDPWSSDS", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPDPBUSDS128", argLength: 3, reg: fp31, asm: "VPDPBUSDS", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPSIGND128", argLength: 2, reg: fp21, asm: "VPSIGND", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSUBD128", argLength: 2, reg: fp21, asm: "VPSUBD", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPDPBUSD128", argLength: 3, reg: fp31, asm: "VPDPBUSD", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPABSD256", argLength: 1, reg: fp11, asm: "VPABSD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPADDD256", argLength: 2, reg: fp21, asm: "VPADDD", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPCMPEQD256", argLength: 2, reg: fp21, asm: "VPCMPEQD", commutative: true, typ: "Vec256", resultInArg0: false}, @@ -333,18 +349,26 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1 {name: "VPMINSDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPMINSD", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMULLDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPMULLD", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPORDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPORD", commutative: true, typ: "Vec256", resultInArg0: false}, + {name: "VPDPWSSDMasked256", argLength: 4, reg: fp3k1fp1, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPOPCNTDMasked256", argLength: 2, reg: fp1k1fp1, asm: "VPOPCNTD", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPDPWSSDSMasked256", argLength: 4, reg: fp3k1fp1, asm: "VPDPWSSDS", commutative: false, typ: "Vec256", resultInArg0: true}, + {name: "VPDPBUSDSMasked256", argLength: 4, reg: fp3k1fp1, asm: "VPDPBUSDS", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPSUBDMasked256", argLength: 3, reg: 
fp2k1fp1, asm: "VPSUBD", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPDPBUSDMasked256", argLength: 4, reg: fp3k1fp1, asm: "VPDPBUSD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPXORDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPXORD", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMAXSD256", argLength: 2, reg: fp21, asm: "VPMAXSD", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMINSD256", argLength: 2, reg: fp21, asm: "VPMINSD", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMULDQ256", argLength: 2, reg: fp21, asm: "VPMULDQ", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMULLD256", argLength: 2, reg: fp21, asm: "VPMULLD", commutative: true, typ: "Vec256", resultInArg0: false}, + {name: "VPDPWSSD256", argLength: 3, reg: fp31, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPHADDD256", argLength: 2, reg: fp21, asm: "VPHADDD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPHSUBD256", argLength: 2, reg: fp21, asm: "VPHSUBD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPOPCNTD256", argLength: 1, reg: fp11, asm: "VPOPCNTD", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPDPWSSDS256", argLength: 3, reg: fp31, asm: "VPDPWSSDS", commutative: false, typ: "Vec256", resultInArg0: true}, + {name: "VPDPBUSDS256", argLength: 3, reg: fp31, asm: "VPDPBUSDS", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPSIGND256", argLength: 2, reg: fp21, asm: "VPSIGND", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSUBD256", argLength: 2, reg: fp21, asm: "VPSUBD", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPDPBUSD256", argLength: 3, reg: fp31, asm: "VPDPBUSD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPABSQ128", argLength: 1, reg: fp11, asm: "VPABSQ", commutative: false, typ: "Vec128", 
resultInArg0: false}, {name: "VPADDQ128", argLength: 2, reg: fp21, asm: "VPADDQ", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPCMPEQQ128", argLength: 2, reg: fp21, asm: "VPCMPEQQ", commutative: true, typ: "Vec128", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 3e3411e0df..404f1fc69f 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -427,16 +427,24 @@ func simdGenericOps() []opData { {name: "MaskedMulLowInt32x16", argLength: 3, commutative: true}, {name: "MaskedNotEqualInt32x16", argLength: 3, commutative: true}, {name: "MaskedOrInt32x16", argLength: 3, commutative: true}, + {name: "MaskedPairDotProdAccumulateInt32x16", argLength: 4, commutative: false}, {name: "MaskedPopCountInt32x16", argLength: 2, commutative: false}, + {name: "MaskedSaturatedPairDotProdAccumulateInt32x16", argLength: 4, commutative: false}, + {name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16", argLength: 4, commutative: false}, {name: "MaskedSubInt32x16", argLength: 3, commutative: false}, + {name: "MaskedUnsignedSignedQuadDotProdAccumulateInt32x16", argLength: 4, commutative: false}, {name: "MaskedXorInt32x16", argLength: 3, commutative: true}, {name: "MaxInt32x16", argLength: 2, commutative: true}, {name: "MinInt32x16", argLength: 2, commutative: true}, {name: "MulLowInt32x16", argLength: 2, commutative: true}, {name: "NotEqualInt32x16", argLength: 2, commutative: true}, {name: "OrInt32x16", argLength: 2, commutative: true}, + {name: "PairDotProdAccumulateInt32x16", argLength: 3, commutative: false}, {name: "PopCountInt32x16", argLength: 1, commutative: false}, + {name: "SaturatedPairDotProdAccumulateInt32x16", argLength: 3, commutative: false}, + {name: "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x16", argLength: 3, commutative: false}, {name: "SubInt32x16", argLength: 2, 
commutative: false}, + {name: "UnsignedSignedQuadDotProdAccumulateInt32x16", argLength: 3, commutative: false}, {name: "XorInt32x16", argLength: 2, commutative: true}, {name: "AbsoluteInt32x4", argLength: 1, commutative: false}, {name: "AddInt32x4", argLength: 2, commutative: true}, @@ -461,8 +469,12 @@ func simdGenericOps() []opData { {name: "MaskedMulLowInt32x4", argLength: 3, commutative: true}, {name: "MaskedNotEqualInt32x4", argLength: 3, commutative: true}, {name: "MaskedOrInt32x4", argLength: 3, commutative: true}, + {name: "MaskedPairDotProdAccumulateInt32x4", argLength: 4, commutative: false}, {name: "MaskedPopCountInt32x4", argLength: 2, commutative: false}, + {name: "MaskedSaturatedPairDotProdAccumulateInt32x4", argLength: 4, commutative: false}, + {name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4", argLength: 4, commutative: false}, {name: "MaskedSubInt32x4", argLength: 3, commutative: false}, + {name: "MaskedUnsignedSignedQuadDotProdAccumulateInt32x4", argLength: 4, commutative: false}, {name: "MaskedXorInt32x4", argLength: 3, commutative: true}, {name: "MaxInt32x4", argLength: 2, commutative: true}, {name: "MinInt32x4", argLength: 2, commutative: true}, @@ -470,11 +482,15 @@ func simdGenericOps() []opData { {name: "MulLowInt32x4", argLength: 2, commutative: true}, {name: "NotEqualInt32x4", argLength: 2, commutative: true}, {name: "OrInt32x4", argLength: 2, commutative: true}, + {name: "PairDotProdAccumulateInt32x4", argLength: 3, commutative: false}, {name: "PairwiseAddInt32x4", argLength: 2, commutative: false}, {name: "PairwiseSubInt32x4", argLength: 2, commutative: false}, {name: "PopCountInt32x4", argLength: 1, commutative: false}, + {name: "SaturatedPairDotProdAccumulateInt32x4", argLength: 3, commutative: false}, + {name: "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x4", argLength: 3, commutative: false}, {name: "SignInt32x4", argLength: 2, commutative: false}, {name: "SubInt32x4", argLength: 2, commutative: false}, + {name: 
"UnsignedSignedQuadDotProdAccumulateInt32x4", argLength: 3, commutative: false}, {name: "XorInt32x4", argLength: 2, commutative: true}, {name: "AbsoluteInt32x8", argLength: 1, commutative: false}, {name: "AddInt32x8", argLength: 2, commutative: true}, @@ -499,8 +515,12 @@ func simdGenericOps() []opData { {name: "MaskedMulLowInt32x8", argLength: 3, commutative: true}, {name: "MaskedNotEqualInt32x8", argLength: 3, commutative: true}, {name: "MaskedOrInt32x8", argLength: 3, commutative: true}, + {name: "MaskedPairDotProdAccumulateInt32x8", argLength: 4, commutative: false}, {name: "MaskedPopCountInt32x8", argLength: 2, commutative: false}, + {name: "MaskedSaturatedPairDotProdAccumulateInt32x8", argLength: 4, commutative: false}, + {name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8", argLength: 4, commutative: false}, {name: "MaskedSubInt32x8", argLength: 3, commutative: false}, + {name: "MaskedUnsignedSignedQuadDotProdAccumulateInt32x8", argLength: 4, commutative: false}, {name: "MaskedXorInt32x8", argLength: 3, commutative: true}, {name: "MaxInt32x8", argLength: 2, commutative: true}, {name: "MinInt32x8", argLength: 2, commutative: true}, @@ -508,11 +528,15 @@ func simdGenericOps() []opData { {name: "MulLowInt32x8", argLength: 2, commutative: true}, {name: "NotEqualInt32x8", argLength: 2, commutative: true}, {name: "OrInt32x8", argLength: 2, commutative: true}, + {name: "PairDotProdAccumulateInt32x8", argLength: 3, commutative: false}, {name: "PairwiseAddInt32x8", argLength: 2, commutative: false}, {name: "PairwiseSubInt32x8", argLength: 2, commutative: false}, {name: "PopCountInt32x8", argLength: 1, commutative: false}, + {name: "SaturatedPairDotProdAccumulateInt32x8", argLength: 3, commutative: false}, + {name: "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x8", argLength: 3, commutative: false}, {name: "SignInt32x8", argLength: 2, commutative: false}, {name: "SubInt32x8", argLength: 2, commutative: false}, + {name: 
"UnsignedSignedQuadDotProdAccumulateInt32x8", argLength: 3, commutative: false}, {name: "XorInt32x8", argLength: 2, commutative: true}, {name: "AbsoluteInt64x2", argLength: 1, commutative: false}, {name: "AddInt64x2", argLength: 2, commutative: true}, @@ -845,14 +869,18 @@ func simdGenericOps() []opData { {name: "MaskedNotEqualUint32x16", argLength: 3, commutative: true}, {name: "MaskedOrUint32x16", argLength: 3, commutative: true}, {name: "MaskedPopCountUint32x16", argLength: 2, commutative: false}, + {name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16", argLength: 4, commutative: false}, {name: "MaskedSubUint32x16", argLength: 3, commutative: false}, + {name: "MaskedUnsignedSignedQuadDotProdAccumulateUint32x16", argLength: 4, commutative: false}, {name: "MaskedXorUint32x16", argLength: 3, commutative: true}, {name: "MaxUint32x16", argLength: 2, commutative: true}, {name: "MinUint32x16", argLength: 2, commutative: true}, {name: "NotEqualUint32x16", argLength: 2, commutative: true}, {name: "OrUint32x16", argLength: 2, commutative: true}, {name: "PopCountUint32x16", argLength: 1, commutative: false}, + {name: "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x16", argLength: 3, commutative: false}, {name: "SubUint32x16", argLength: 2, commutative: false}, + {name: "UnsignedSignedQuadDotProdAccumulateUint32x16", argLength: 3, commutative: false}, {name: "XorUint32x16", argLength: 2, commutative: true}, {name: "AddUint32x4", argLength: 2, commutative: true}, {name: "AndUint32x4", argLength: 2, commutative: true}, @@ -875,7 +903,9 @@ func simdGenericOps() []opData { {name: "MaskedNotEqualUint32x4", argLength: 3, commutative: true}, {name: "MaskedOrUint32x4", argLength: 3, commutative: true}, {name: "MaskedPopCountUint32x4", argLength: 2, commutative: false}, + {name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4", argLength: 4, commutative: false}, {name: "MaskedSubUint32x4", argLength: 3, commutative: false}, + {name: 
"MaskedUnsignedSignedQuadDotProdAccumulateUint32x4", argLength: 4, commutative: false}, {name: "MaskedXorUint32x4", argLength: 3, commutative: true}, {name: "MaxUint32x4", argLength: 2, commutative: true}, {name: "MinUint32x4", argLength: 2, commutative: true}, @@ -885,7 +915,9 @@ func simdGenericOps() []opData { {name: "PairwiseAddUint32x4", argLength: 2, commutative: false}, {name: "PairwiseSubUint32x4", argLength: 2, commutative: false}, {name: "PopCountUint32x4", argLength: 1, commutative: false}, + {name: "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x4", argLength: 3, commutative: false}, {name: "SubUint32x4", argLength: 2, commutative: false}, + {name: "UnsignedSignedQuadDotProdAccumulateUint32x4", argLength: 3, commutative: false}, {name: "XorUint32x4", argLength: 2, commutative: true}, {name: "AddUint32x8", argLength: 2, commutative: true}, {name: "AndUint32x8", argLength: 2, commutative: true}, @@ -908,7 +940,9 @@ func simdGenericOps() []opData { {name: "MaskedNotEqualUint32x8", argLength: 3, commutative: true}, {name: "MaskedOrUint32x8", argLength: 3, commutative: true}, {name: "MaskedPopCountUint32x8", argLength: 2, commutative: false}, + {name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8", argLength: 4, commutative: false}, {name: "MaskedSubUint32x8", argLength: 3, commutative: false}, + {name: "MaskedUnsignedSignedQuadDotProdAccumulateUint32x8", argLength: 4, commutative: false}, {name: "MaskedXorUint32x8", argLength: 3, commutative: true}, {name: "MaxUint32x8", argLength: 2, commutative: true}, {name: "MinUint32x8", argLength: 2, commutative: true}, @@ -918,7 +952,9 @@ func simdGenericOps() []opData { {name: "PairwiseAddUint32x8", argLength: 2, commutative: false}, {name: "PairwiseSubUint32x8", argLength: 2, commutative: false}, {name: "PopCountUint32x8", argLength: 1, commutative: false}, + {name: "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x8", argLength: 3, commutative: false}, {name: "SubUint32x8", argLength: 2, 
commutative: false}, + {name: "UnsignedSignedQuadDotProdAccumulateUint32x8", argLength: 3, commutative: false}, {name: "XorUint32x8", argLength: 2, commutative: true}, {name: "AddUint64x2", argLength: 2, commutative: true}, {name: "AndUint64x2", argLength: 2, commutative: true}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 3ef08ae555..26facad933 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1476,15 +1476,23 @@ const ( OpAMD64VPMINSDMasked512 OpAMD64VPMULLDMasked512 OpAMD64VPORDMasked512 + OpAMD64VPDPWSSDMasked512 OpAMD64VPOPCNTDMasked512 + OpAMD64VPDPWSSDSMasked512 + OpAMD64VPDPBUSDSMasked512 OpAMD64VPSUBDMasked512 + OpAMD64VPDPBUSDMasked512 OpAMD64VPXORDMasked512 OpAMD64VPMAXSD512 OpAMD64VPMINSD512 OpAMD64VPMULLD512 OpAMD64VPORD512 + OpAMD64VPDPWSSD512 OpAMD64VPOPCNTD512 + OpAMD64VPDPWSSDS512 + OpAMD64VPDPBUSDS512 OpAMD64VPSUBD512 + OpAMD64VPDPBUSD512 OpAMD64VPXORD512 OpAMD64VPABSD128 OpAMD64VPADDD128 @@ -1500,18 +1508,26 @@ const ( OpAMD64VPMINSDMasked128 OpAMD64VPMULLDMasked128 OpAMD64VPORDMasked128 + OpAMD64VPDPWSSDMasked128 OpAMD64VPOPCNTDMasked128 + OpAMD64VPDPWSSDSMasked128 + OpAMD64VPDPBUSDSMasked128 OpAMD64VPSUBDMasked128 + OpAMD64VPDPBUSDMasked128 OpAMD64VPXORDMasked128 OpAMD64VPMAXSD128 OpAMD64VPMINSD128 OpAMD64VPMULDQ128 OpAMD64VPMULLD128 + OpAMD64VPDPWSSD128 OpAMD64VPHADDD128 OpAMD64VPHSUBD128 OpAMD64VPOPCNTD128 + OpAMD64VPDPWSSDS128 + OpAMD64VPDPBUSDS128 OpAMD64VPSIGND128 OpAMD64VPSUBD128 + OpAMD64VPDPBUSD128 OpAMD64VPABSD256 OpAMD64VPADDD256 OpAMD64VPCMPEQD256 @@ -1526,18 +1542,26 @@ const ( OpAMD64VPMINSDMasked256 OpAMD64VPMULLDMasked256 OpAMD64VPORDMasked256 + OpAMD64VPDPWSSDMasked256 OpAMD64VPOPCNTDMasked256 + OpAMD64VPDPWSSDSMasked256 + OpAMD64VPDPBUSDSMasked256 OpAMD64VPSUBDMasked256 + OpAMD64VPDPBUSDMasked256 OpAMD64VPXORDMasked256 OpAMD64VPMAXSD256 OpAMD64VPMINSD256 OpAMD64VPMULDQ256 OpAMD64VPMULLD256 + OpAMD64VPDPWSSD256 OpAMD64VPHADDD256 
OpAMD64VPHSUBD256 OpAMD64VPOPCNTD256 + OpAMD64VPDPWSSDS256 + OpAMD64VPDPBUSDS256 OpAMD64VPSIGND256 OpAMD64VPSUBD256 + OpAMD64VPDPBUSD256 OpAMD64VPABSQ128 OpAMD64VPADDQ128 OpAMD64VPCMPEQQ128 @@ -4491,16 +4515,24 @@ const ( OpMaskedMulLowInt32x16 OpMaskedNotEqualInt32x16 OpMaskedOrInt32x16 + OpMaskedPairDotProdAccumulateInt32x16 OpMaskedPopCountInt32x16 + OpMaskedSaturatedPairDotProdAccumulateInt32x16 + OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16 OpMaskedSubInt32x16 + OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x16 OpMaskedXorInt32x16 OpMaxInt32x16 OpMinInt32x16 OpMulLowInt32x16 OpNotEqualInt32x16 OpOrInt32x16 + OpPairDotProdAccumulateInt32x16 OpPopCountInt32x16 + OpSaturatedPairDotProdAccumulateInt32x16 + OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16 OpSubInt32x16 + OpUnsignedSignedQuadDotProdAccumulateInt32x16 OpXorInt32x16 OpAbsoluteInt32x4 OpAddInt32x4 @@ -4525,8 +4557,12 @@ const ( OpMaskedMulLowInt32x4 OpMaskedNotEqualInt32x4 OpMaskedOrInt32x4 + OpMaskedPairDotProdAccumulateInt32x4 OpMaskedPopCountInt32x4 + OpMaskedSaturatedPairDotProdAccumulateInt32x4 + OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4 OpMaskedSubInt32x4 + OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x4 OpMaskedXorInt32x4 OpMaxInt32x4 OpMinInt32x4 @@ -4534,11 +4570,15 @@ const ( OpMulLowInt32x4 OpNotEqualInt32x4 OpOrInt32x4 + OpPairDotProdAccumulateInt32x4 OpPairwiseAddInt32x4 OpPairwiseSubInt32x4 OpPopCountInt32x4 + OpSaturatedPairDotProdAccumulateInt32x4 + OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4 OpSignInt32x4 OpSubInt32x4 + OpUnsignedSignedQuadDotProdAccumulateInt32x4 OpXorInt32x4 OpAbsoluteInt32x8 OpAddInt32x8 @@ -4563,8 +4603,12 @@ const ( OpMaskedMulLowInt32x8 OpMaskedNotEqualInt32x8 OpMaskedOrInt32x8 + OpMaskedPairDotProdAccumulateInt32x8 OpMaskedPopCountInt32x8 + OpMaskedSaturatedPairDotProdAccumulateInt32x8 + OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8 OpMaskedSubInt32x8 + 
OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x8 OpMaskedXorInt32x8 OpMaxInt32x8 OpMinInt32x8 @@ -4572,11 +4616,15 @@ const ( OpMulLowInt32x8 OpNotEqualInt32x8 OpOrInt32x8 + OpPairDotProdAccumulateInt32x8 OpPairwiseAddInt32x8 OpPairwiseSubInt32x8 OpPopCountInt32x8 + OpSaturatedPairDotProdAccumulateInt32x8 + OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8 OpSignInt32x8 OpSubInt32x8 + OpUnsignedSignedQuadDotProdAccumulateInt32x8 OpXorInt32x8 OpAbsoluteInt64x2 OpAddInt64x2 @@ -4909,14 +4957,18 @@ const ( OpMaskedNotEqualUint32x16 OpMaskedOrUint32x16 OpMaskedPopCountUint32x16 + OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16 OpMaskedSubUint32x16 + OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x16 OpMaskedXorUint32x16 OpMaxUint32x16 OpMinUint32x16 OpNotEqualUint32x16 OpOrUint32x16 OpPopCountUint32x16 + OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16 OpSubUint32x16 + OpUnsignedSignedQuadDotProdAccumulateUint32x16 OpXorUint32x16 OpAddUint32x4 OpAndUint32x4 @@ -4939,7 +4991,9 @@ const ( OpMaskedNotEqualUint32x4 OpMaskedOrUint32x4 OpMaskedPopCountUint32x4 + OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4 OpMaskedSubUint32x4 + OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x4 OpMaskedXorUint32x4 OpMaxUint32x4 OpMinUint32x4 @@ -4949,7 +5003,9 @@ const ( OpPairwiseAddUint32x4 OpPairwiseSubUint32x4 OpPopCountUint32x4 + OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4 OpSubUint32x4 + OpUnsignedSignedQuadDotProdAccumulateUint32x4 OpXorUint32x4 OpAddUint32x8 OpAndUint32x8 @@ -4972,7 +5028,9 @@ const ( OpMaskedNotEqualUint32x8 OpMaskedOrUint32x8 OpMaskedPopCountUint32x8 + OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8 OpMaskedSubUint32x8 + OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x8 OpMaskedXorUint32x8 OpMaxUint32x8 OpMinUint32x8 @@ -4982,7 +5040,9 @@ const ( OpPairwiseAddUint32x8 OpPairwiseSubUint32x8 OpPopCountUint32x8 + OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8 OpSubUint32x8 + 
OpUnsignedSignedQuadDotProdAccumulateUint32x8 OpXorUint32x8 OpAddUint64x2 OpAndUint64x2 @@ -22116,6 +22176,23 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPWSSDMasked512", + argLen: 4, + resultInArg0: true, + asm: x86.AVPDPWSSD, + reg: regInfo{ + inputs: []inputInfo{ + {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPOPCNTDMasked512", argLen: 2, @@ -22130,6 +22207,40 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPWSSDSMasked512", + argLen: 4, + resultInArg0: true, + asm: x86.AVPDPWSSDS, + reg: regInfo{ + inputs: []inputInfo{ + {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPDPBUSDSMasked512", + argLen: 4, + resultInArg0: true, + asm: x86.AVPDPBUSDS, + reg: regInfo{ + inputs: []inputInfo{ + {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPSUBDMasked512", argLen: 3, @@ -22145,6 +22256,23 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPBUSDMasked512", + argLen: 4, + resultInArg0: true, + asm: x86.AVPDPBUSD, + reg: regInfo{ + 
inputs: []inputInfo{ + {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPXORDMasked512", argLen: 3, @@ -22221,6 +22349,22 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPWSSD512", + argLen: 3, + resultInArg0: true, + asm: x86.AVPDPWSSD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPOPCNTD512", argLen: 1, @@ -22234,6 +22378,38 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPWSSDS512", + argLen: 3, + resultInArg0: true, + asm: x86.AVPDPWSSDS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPDPBUSDS512", + argLen: 3, + resultInArg0: true, + asm: x86.AVPDPBUSDS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPSUBD512", 
argLen: 2, @@ -22248,6 +22424,22 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPBUSD512", + argLen: 3, + resultInArg0: true, + asm: x86.AVPDPBUSD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPXORD512", argLen: 2, @@ -22477,6 +22669,23 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPWSSDMasked128", + argLen: 4, + resultInArg0: true, + asm: x86.AVPDPWSSD, + reg: regInfo{ + inputs: []inputInfo{ + {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPOPCNTDMasked128", argLen: 2, @@ -22491,6 +22700,40 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPWSSDSMasked128", + argLen: 4, + resultInArg0: true, + asm: x86.AVPDPWSSDS, + reg: regInfo{ + inputs: []inputInfo{ + {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPDPBUSDSMasked128", + argLen: 4, + resultInArg0: true, + asm: x86.AVPDPBUSDS, + reg: regInfo{ + inputs: []inputInfo{ + {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 
X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPSUBDMasked128", argLen: 3, @@ -22506,6 +22749,23 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPBUSDMasked128", + argLen: 4, + resultInArg0: true, + asm: x86.AVPDPBUSD, + reg: regInfo{ + inputs: []inputInfo{ + {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPXORDMasked128", argLen: 3, @@ -22582,6 +22842,22 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPWSSD128", + argLen: 3, + resultInArg0: true, + asm: x86.AVPDPWSSD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPHADDD128", argLen: 2, @@ -22623,6 +22899,38 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPWSSDS128", + argLen: 3, + resultInArg0: true, + asm: x86.AVPDPWSSDS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + 
name: "VPDPBUSDS128", + argLen: 3, + resultInArg0: true, + asm: x86.AVPDPBUSDS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPSIGND128", argLen: 2, @@ -22651,6 +22959,22 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPBUSD128", + argLen: 3, + resultInArg0: true, + asm: x86.AVPDPBUSD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPABSD256", argLen: 1, @@ -22865,6 +23189,23 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPWSSDMasked256", + argLen: 4, + resultInArg0: true, + asm: x86.AVPDPWSSD, + reg: regInfo{ + inputs: []inputInfo{ + {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPOPCNTDMasked256", argLen: 2, @@ -22879,6 +23220,40 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPWSSDSMasked256", + argLen: 4, + resultInArg0: true, + asm: x86.AVPDPWSSDS, + reg: regInfo{ + inputs: []inputInfo{ + {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 
X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPDPBUSDSMasked256", + argLen: 4, + resultInArg0: true, + asm: x86.AVPDPBUSDS, + reg: regInfo{ + inputs: []inputInfo{ + {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPSUBDMasked256", argLen: 3, @@ -22894,6 +23269,23 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPBUSDMasked256", + argLen: 4, + resultInArg0: true, + asm: x86.AVPDPBUSD, + reg: regInfo{ + inputs: []inputInfo{ + {3, 1090921693184}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPXORDMasked256", argLen: 3, @@ -22970,6 +23362,22 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPWSSD256", + argLen: 3, + resultInArg0: true, + asm: x86.AVPDPWSSD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPHADDD256", argLen: 2, @@ -23011,6 +23419,38 @@ var opcodeTable = [...]opInfo{ }, 
}, }, + { + name: "VPDPWSSDS256", + argLen: 3, + resultInArg0: true, + asm: x86.AVPDPWSSDS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPDPBUSDS256", + argLen: 3, + resultInArg0: true, + asm: x86.AVPDPBUSDS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPSIGND256", argLen: 2, @@ -23039,6 +23479,22 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPDPBUSD256", + argLen: 3, + resultInArg0: true, + asm: x86.AVPDPBUSD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPABSQ128", argLen: 1, @@ -57134,16 +57590,36 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "MaskedPairDotProdAccumulateInt32x16", + argLen: 4, + generic: true, + }, { name: "MaskedPopCountInt32x16", argLen: 2, generic: true, }, + { + name: "MaskedSaturatedPairDotProdAccumulateInt32x16", + argLen: 4, + generic: true, + }, + { + name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16", + argLen: 4, + generic: true, + }, { name: "MaskedSubInt32x16", argLen: 3, 
generic: true, }, + { + name: "MaskedUnsignedSignedQuadDotProdAccumulateInt32x16", + argLen: 4, + generic: true, + }, { name: "MaskedXorInt32x16", argLen: 3, @@ -57180,16 +57656,36 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "PairDotProdAccumulateInt32x16", + argLen: 3, + generic: true, + }, { name: "PopCountInt32x16", argLen: 1, generic: true, }, + { + name: "SaturatedPairDotProdAccumulateInt32x16", + argLen: 3, + generic: true, + }, + { + name: "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x16", + argLen: 3, + generic: true, + }, { name: "SubInt32x16", argLen: 2, generic: true, }, + { + name: "UnsignedSignedQuadDotProdAccumulateInt32x16", + argLen: 3, + generic: true, + }, { name: "XorInt32x16", argLen: 2, @@ -57324,16 +57820,36 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "MaskedPairDotProdAccumulateInt32x4", + argLen: 4, + generic: true, + }, { name: "MaskedPopCountInt32x4", argLen: 2, generic: true, }, + { + name: "MaskedSaturatedPairDotProdAccumulateInt32x4", + argLen: 4, + generic: true, + }, + { + name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4", + argLen: 4, + generic: true, + }, { name: "MaskedSubInt32x4", argLen: 3, generic: true, }, + { + name: "MaskedUnsignedSignedQuadDotProdAccumulateInt32x4", + argLen: 4, + generic: true, + }, { name: "MaskedXorInt32x4", argLen: 3, @@ -57376,6 +57892,11 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "PairDotProdAccumulateInt32x4", + argLen: 3, + generic: true, + }, { name: "PairwiseAddInt32x4", argLen: 2, @@ -57391,6 +57912,16 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "SaturatedPairDotProdAccumulateInt32x4", + argLen: 3, + generic: true, + }, + { + name: "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x4", + argLen: 3, + generic: true, + }, { name: "SignInt32x4", argLen: 2, @@ -57401,6 +57932,11 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: 
true, }, + { + name: "UnsignedSignedQuadDotProdAccumulateInt32x4", + argLen: 3, + generic: true, + }, { name: "XorInt32x4", argLen: 2, @@ -57535,16 +58071,36 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "MaskedPairDotProdAccumulateInt32x8", + argLen: 4, + generic: true, + }, { name: "MaskedPopCountInt32x8", argLen: 2, generic: true, }, + { + name: "MaskedSaturatedPairDotProdAccumulateInt32x8", + argLen: 4, + generic: true, + }, + { + name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8", + argLen: 4, + generic: true, + }, { name: "MaskedSubInt32x8", argLen: 3, generic: true, }, + { + name: "MaskedUnsignedSignedQuadDotProdAccumulateInt32x8", + argLen: 4, + generic: true, + }, { name: "MaskedXorInt32x8", argLen: 3, @@ -57587,6 +58143,11 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "PairDotProdAccumulateInt32x8", + argLen: 3, + generic: true, + }, { name: "PairwiseAddInt32x8", argLen: 2, @@ -57602,6 +58163,16 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "SaturatedPairDotProdAccumulateInt32x8", + argLen: 3, + generic: true, + }, + { + name: "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x8", + argLen: 3, + generic: true, + }, { name: "SignInt32x8", argLen: 2, @@ -57612,6 +58183,11 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "UnsignedSignedQuadDotProdAccumulateInt32x8", + argLen: 3, + generic: true, + }, { name: "XorInt32x8", argLen: 2, @@ -59451,11 +60027,21 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16", + argLen: 4, + generic: true, + }, { name: "MaskedSubUint32x16", argLen: 3, generic: true, }, + { + name: "MaskedUnsignedSignedQuadDotProdAccumulateUint32x16", + argLen: 4, + generic: true, + }, { name: "MaskedXorUint32x16", argLen: 3, @@ -59491,11 +60077,21 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + 
name: "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x16", + argLen: 3, + generic: true, + }, { name: "SubUint32x16", argLen: 2, generic: true, }, + { + name: "UnsignedSignedQuadDotProdAccumulateUint32x16", + argLen: 3, + generic: true, + }, { name: "XorUint32x16", argLen: 2, @@ -59619,11 +60215,21 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4", + argLen: 4, + generic: true, + }, { name: "MaskedSubUint32x4", argLen: 3, generic: true, }, + { + name: "MaskedUnsignedSignedQuadDotProdAccumulateUint32x4", + argLen: 4, + generic: true, + }, { name: "MaskedXorUint32x4", argLen: 3, @@ -59675,11 +60281,21 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x4", + argLen: 3, + generic: true, + }, { name: "SubUint32x4", argLen: 2, generic: true, }, + { + name: "UnsignedSignedQuadDotProdAccumulateUint32x4", + argLen: 3, + generic: true, + }, { name: "XorUint32x4", argLen: 2, @@ -59803,11 +60419,21 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8", + argLen: 4, + generic: true, + }, { name: "MaskedSubUint32x8", argLen: 3, generic: true, }, + { + name: "MaskedUnsignedSignedQuadDotProdAccumulateUint32x8", + argLen: 4, + generic: true, + }, { name: "MaskedXorUint32x8", argLen: 3, @@ -59859,11 +60485,21 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x8", + argLen: 3, + generic: true, + }, { name: "SubUint32x8", argLen: 2, generic: true, }, + { + name: "UnsignedSignedQuadDotProdAccumulateUint32x8", + argLen: 3, + generic: true, + }, { name: "XorUint32x8", argLen: 2, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 3605e75213..60469f49d9 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go 
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -2696,6 +2696,12 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpMaskedOrUint64x4(v) case OpMaskedOrUint64x8: return rewriteValueAMD64_OpMaskedOrUint64x8(v) + case OpMaskedPairDotProdAccumulateInt32x16: + return rewriteValueAMD64_OpMaskedPairDotProdAccumulateInt32x16(v) + case OpMaskedPairDotProdAccumulateInt32x4: + return rewriteValueAMD64_OpMaskedPairDotProdAccumulateInt32x4(v) + case OpMaskedPairDotProdAccumulateInt32x8: + return rewriteValueAMD64_OpMaskedPairDotProdAccumulateInt32x8(v) case OpMaskedPairDotProdInt16x16: return rewriteValueAMD64_OpMaskedPairDotProdInt16x16(v) case OpMaskedPairDotProdInt16x32: @@ -2798,6 +2804,12 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpMaskedSaturatedAddUint8x32(v) case OpMaskedSaturatedAddUint8x64: return rewriteValueAMD64_OpMaskedSaturatedAddUint8x64(v) + case OpMaskedSaturatedPairDotProdAccumulateInt32x16: + return rewriteValueAMD64_OpMaskedSaturatedPairDotProdAccumulateInt32x16(v) + case OpMaskedSaturatedPairDotProdAccumulateInt32x4: + return rewriteValueAMD64_OpMaskedSaturatedPairDotProdAccumulateInt32x4(v) + case OpMaskedSaturatedPairDotProdAccumulateInt32x8: + return rewriteValueAMD64_OpMaskedSaturatedPairDotProdAccumulateInt32x8(v) case OpMaskedSaturatedSubInt16x16: return rewriteValueAMD64_OpMaskedSaturatedSubInt16x16(v) case OpMaskedSaturatedSubInt16x32: @@ -2828,6 +2840,18 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedPairDotProdUint16x32(v) case OpMaskedSaturatedUnsignedSignedPairDotProdUint16x8: return rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedPairDotProdUint16x8(v) + case OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16: + return rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16(v) + case OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4: + return 
rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4(v) + case OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8: + return rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8(v) + case OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16: + return rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16(v) + case OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4: + return rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4(v) + case OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8: + return rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8(v) case OpMaskedSqrtFloat32x16: return rewriteValueAMD64_OpMaskedSqrtFloat32x16(v) case OpMaskedSqrtFloat32x4: @@ -2924,6 +2948,18 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat64x4(v) case OpMaskedTruncWithPrecisionFloat64x8: return rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat64x8(v) + case OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x16: + return rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x16(v) + case OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x4: + return rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x4(v) + case OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x8: + return rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x8(v) + case OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x16: + return rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x16(v) + case OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x4: + return rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x4(v) + case OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x8: + return rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x8(v) case OpMaskedXorFloat32x16: return rewriteValueAMD64_OpMaskedXorFloat32x16(v) case 
OpMaskedXorFloat32x4: @@ -3490,6 +3526,15 @@ func rewriteValueAMD64(v *Value) bool { case OpOrUint8x32: v.Op = OpAMD64VPOR256 return true + case OpPairDotProdAccumulateInt32x16: + v.Op = OpAMD64VPDPWSSD512 + return true + case OpPairDotProdAccumulateInt32x4: + v.Op = OpAMD64VPDPWSSD128 + return true + case OpPairDotProdAccumulateInt32x8: + v.Op = OpAMD64VPDPWSSD256 + return true case OpPairDotProdInt16x16: v.Op = OpAMD64VPMADDWD256 return true @@ -3813,6 +3858,15 @@ func rewriteValueAMD64(v *Value) bool { case OpSaturatedAddUint8x64: v.Op = OpAMD64VPADDSB512 return true + case OpSaturatedPairDotProdAccumulateInt32x16: + v.Op = OpAMD64VPDPWSSDS512 + return true + case OpSaturatedPairDotProdAccumulateInt32x4: + v.Op = OpAMD64VPDPWSSDS128 + return true + case OpSaturatedPairDotProdAccumulateInt32x8: + v.Op = OpAMD64VPDPWSSDS256 + return true case OpSaturatedPairwiseAddInt16x16: v.Op = OpAMD64VPHADDSW256 return true @@ -3876,6 +3930,24 @@ func rewriteValueAMD64(v *Value) bool { case OpSaturatedUnsignedSignedPairDotProdUint8x32: v.Op = OpAMD64VPMADDUBSW256 return true + case OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16: + v.Op = OpAMD64VPDPBUSDS512 + return true + case OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4: + v.Op = OpAMD64VPDPBUSDS128 + return true + case OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8: + v.Op = OpAMD64VPDPBUSDS256 + return true + case OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16: + v.Op = OpAMD64VPDPBUSDS512 + return true + case OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4: + v.Op = OpAMD64VPDPBUSDS128 + return true + case OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8: + v.Op = OpAMD64VPDPBUSDS256 + return true case OpSelect0: return rewriteValueAMD64_OpSelect0(v) case OpSelect1: @@ -4119,6 +4191,24 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpTruncWithPrecisionFloat64x4(v) case OpTruncWithPrecisionFloat64x8: return rewriteValueAMD64_OpTruncWithPrecisionFloat64x8(v) 
+ case OpUnsignedSignedQuadDotProdAccumulateInt32x16: + v.Op = OpAMD64VPDPBUSD512 + return true + case OpUnsignedSignedQuadDotProdAccumulateInt32x4: + v.Op = OpAMD64VPDPBUSD128 + return true + case OpUnsignedSignedQuadDotProdAccumulateInt32x8: + v.Op = OpAMD64VPDPBUSD256 + return true + case OpUnsignedSignedQuadDotProdAccumulateUint32x16: + v.Op = OpAMD64VPDPBUSD512 + return true + case OpUnsignedSignedQuadDotProdAccumulateUint32x4: + v.Op = OpAMD64VPDPBUSD128 + return true + case OpUnsignedSignedQuadDotProdAccumulateUint32x8: + v.Op = OpAMD64VPDPBUSD256 + return true case OpWB: v.Op = OpAMD64LoweredWB return true @@ -42772,6 +42862,66 @@ func rewriteValueAMD64_OpMaskedOrUint64x8(v *Value) bool { return true } } +func rewriteValueAMD64_OpMaskedPairDotProdAccumulateInt32x16(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedPairDotProdAccumulateInt32x16 x y z mask) + // result: (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPWSSDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedPairDotProdAccumulateInt32x4(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedPairDotProdAccumulateInt32x4 x y z mask) + // result: (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPWSSDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedPairDotProdAccumulateInt32x8(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedPairDotProdAccumulateInt32x8 x y z mask) + // result: 
(VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPWSSDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} func rewriteValueAMD64_OpMaskedPairDotProdInt16x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -43642,6 +43792,66 @@ func rewriteValueAMD64_OpMaskedSaturatedAddUint8x64(v *Value) bool { return true } } +func rewriteValueAMD64_OpMaskedSaturatedPairDotProdAccumulateInt32x16(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedSaturatedPairDotProdAccumulateInt32x16 x y z mask) + // result: (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPWSSDSMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedSaturatedPairDotProdAccumulateInt32x4(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedSaturatedPairDotProdAccumulateInt32x4 x y z mask) + // result: (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPWSSDSMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedSaturatedPairDotProdAccumulateInt32x8(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedSaturatedPairDotProdAccumulateInt32x8 x y z mask) + // result: (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPWSSDSMasked256) + v0 := b.NewValue0(v.Pos, 
OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} func rewriteValueAMD64_OpMaskedSaturatedSubInt16x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -43912,6 +44122,126 @@ func rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedPairDotProdUint16x8(v *Val return true } } +func rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16 x y z mask) + // result: (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPBUSDSMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4 x y z mask) + // result: (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPBUSDSMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8 x y z mask) + // result: (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPBUSDSMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + 
return true + } +} +func rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16 x y z mask) + // result: (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPBUSDSMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4 x y z mask) + // result: (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPBUSDSMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8 x y z mask) + // result: (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPBUSDSMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} func rewriteValueAMD64_OpMaskedSqrtFloat32x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -44764,6 +45094,126 @@ func rewriteValueAMD64_OpMaskedTruncWithPrecisionFloat64x8(v *Value) bool { return true } } +func 
rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x16(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedUnsignedSignedQuadDotProdAccumulateInt32x16 x y z mask) + // result: (VPDPBUSDMasked512 x y z (VPMOVVec32x16ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPBUSDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x4(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedUnsignedSignedQuadDotProdAccumulateInt32x4 x y z mask) + // result: (VPDPBUSDMasked128 x y z (VPMOVVec32x4ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPBUSDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x8(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedUnsignedSignedQuadDotProdAccumulateInt32x8 x y z mask) + // result: (VPDPBUSDMasked256 x y z (VPMOVVec32x8ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPBUSDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x16(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedUnsignedSignedQuadDotProdAccumulateUint32x16 x y z mask) + // result: (VPDPBUSDMasked512 x y z (VPMOVVec32x16ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 
+ v.reset(OpAMD64VPDPBUSDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x4(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedUnsignedSignedQuadDotProdAccumulateUint32x4 x y z mask) + // result: (VPDPBUSDMasked128 x y z (VPMOVVec32x4ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPBUSDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x8(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MaskedUnsignedSignedQuadDotProdAccumulateUint32x8 x y z mask) + // result: (VPDPBUSDMasked256 x y z (VPMOVVec32x8ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPBUSDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} func rewriteValueAMD64_OpMaskedXorFloat32x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 7ac5f74246..b7b80a7063 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -833,6 +833,10 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Int32x16.MaskedOr", opLen3(ssa.OpMaskedOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x16.MaskedSub", opLen3(ssa.OpMaskedSubInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x16.MaskedXor", opLen3(ssa.OpMaskedXorInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x16.PairDotProdAccumulate", opLen3(ssa.OpPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x16.SaturatedPairDotProdAccumulate", opLen3(ssa.OpSaturatedPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x16.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x16.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x4.MaskedAdd", opLen3(ssa.OpMaskedAddInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x4.MaskedAnd", opLen3(ssa.OpMaskedAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x4.MaskedAndNot", opLen3(ssa.OpMaskedAndNotInt32x4, types.TypeVec128), sys.AMD64) @@ -848,6 +852,10 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Int32x4.MaskedOr", opLen3(ssa.OpMaskedOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x4.MaskedSub", opLen3(ssa.OpMaskedSubInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x4.MaskedXor", opLen3(ssa.OpMaskedXorInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.PairDotProdAccumulate", opLen3(ssa.OpPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.SaturatedPairDotProdAccumulate", opLen3(ssa.OpSaturatedPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x8.MaskedAdd", opLen3(ssa.OpMaskedAddInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x8.MaskedAnd", opLen3(ssa.OpMaskedAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x8.MaskedAndNot", opLen3(ssa.OpMaskedAndNotInt32x8, types.TypeVec256), sys.AMD64) @@ -863,6 +871,10 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Int32x8.MaskedOr", opLen3(ssa.OpMaskedOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x8.MaskedSub", opLen3(ssa.OpMaskedSubInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x8.MaskedXor", opLen3(ssa.OpMaskedXorInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x8.PairDotProdAccumulate", opLen3(ssa.OpPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x8.SaturatedPairDotProdAccumulate", opLen3(ssa.OpSaturatedPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x8.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x8.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int64x2.MaskedAdd", opLen3(ssa.OpMaskedAddInt64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x2.MaskedAnd", opLen3(ssa.OpMaskedAndInt64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x2.MaskedAndNot", opLen3(ssa.OpMaskedAndNotInt64x2, types.TypeVec128), sys.AMD64) @@ -1006,6 +1018,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint32x16.MaskedOr", opLen3(ssa.OpMaskedOrUint32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint32x16.MaskedSub", opLen3(ssa.OpMaskedSubUint32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint32x16.MaskedXor", opLen3(ssa.OpMaskedXorUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint32x16.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint32x16.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateUint32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint32x4.MaskedAdd", opLen3(ssa.OpMaskedAddUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x4.MaskedAnd", opLen3(ssa.OpMaskedAndUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x4.MaskedAndNot", opLen3(ssa.OpMaskedAndNotUint32x4, types.TypeVec128), sys.AMD64) @@ -1020,6 +1034,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint32x4.MaskedOr", opLen3(ssa.OpMaskedOrUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x4.MaskedSub", opLen3(ssa.OpMaskedSubUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x4.MaskedXor", opLen3(ssa.OpMaskedXorUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x4.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x4.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x8.MaskedAdd", opLen3(ssa.OpMaskedAddUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x8.MaskedAnd", opLen3(ssa.OpMaskedAndUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x8.MaskedAndNot", opLen3(ssa.OpMaskedAndNotUint32x8, types.TypeVec256), sys.AMD64) @@ -1034,6 +1050,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint32x8.MaskedOr", opLen3(ssa.OpMaskedOrUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x8.MaskedSub", opLen3(ssa.OpMaskedSubUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x8.MaskedXor", opLen3(ssa.OpMaskedXorUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x8.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x8.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x2.MaskedAdd", opLen3(ssa.OpMaskedAddUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x2.MaskedAnd", opLen3(ssa.OpMaskedAndUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x2.MaskedAndNot", opLen3(ssa.OpMaskedAndNotUint64x2, types.TypeVec128), sys.AMD64) @@ -1118,6 +1136,24 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint8x64.MaskedSaturatedAdd", opLen3(ssa.OpMaskedSaturatedAddUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x64.MaskedSaturatedSub", opLen3(ssa.OpMaskedSaturatedSubUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x64.MaskedSub", opLen3(ssa.OpMaskedSubUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x16.MaskedPairDotProdAccumulate", opLen4(ssa.OpMaskedPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x16.MaskedSaturatedPairDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x16.MaskedSaturatedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x16.MaskedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.MaskedPairDotProdAccumulate", opLen4(ssa.OpMaskedPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.MaskedSaturatedPairDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.MaskedSaturatedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.MaskedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.MaskedPairDotProdAccumulate", opLen4(ssa.OpMaskedPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x8.MaskedSaturatedPairDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, 
"Int32x8.MaskedSaturatedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x8.MaskedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x16.MaskedSaturatedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint32x16.MaskedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint32x4.MaskedSaturatedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x4.MaskedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x8.MaskedSaturatedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x8.MaskedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) addF(simdPackage, "Float32x4.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) addF(simdPackage, "Float32x8.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) diff --git a/src/simd/stubs_amd64.go b/src/simd/stubs_amd64.go index 83edaf2270..49af32bc4f 100644 --- 
a/src/simd/stubs_amd64.go +++ b/src/simd/stubs_amd64.go @@ -766,6 +766,7 @@ func (x Float64x2) AndNot(y Float64x2) Float64x2 func (x Float64x2) Div(y Float64x2) Float64x2 // DotProdBroadcast multiplies all elements and broadcasts the sum. +// Const Immediate = 127. // // Asm: VDPPD, CPU Feature: AVX func (x Float64x2) DotProdBroadcast(y Float64x2) Float64x2 @@ -4437,6 +4438,26 @@ func (x Int32x16) MaskedSub(y Int32x16, z Mask32x16) Int32x16 // Asm: VPXORD, CPU Feature: AVX512EVEX func (x Int32x16) MaskedXor(y Int32x16, z Mask32x16) Int32x16 +// PairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x. +// +// Asm: VPDPWSSD, CPU Feature: AVX512EVEX +func (x Int32x16) PairDotProdAccumulate(y Int16x32, z Int32x16) Int32x16 + +// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x with saturation. +// +// Asm: VPDPWSSDS, CPU Feature: AVX512EVEX +func (x Int32x16) SaturatedPairDotProdAccumulate(y Int16x32, z Int32x16) Int32x16 + +// SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation. +// +// Asm: VPDPBUSDS, CPU Feature: AVX512EVEX +func (x Int32x16) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16) Int32x16 + +// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// +// Asm: VPDPBUSD, CPU Feature: AVX512EVEX +func (x Int32x16) UnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16) Int32x16 + // Add adds corresponding elements of two vectors. // // Asm: VPADDD, CPU Feature: AVX512EVEX @@ -4518,6 +4539,26 @@ func (x Int32x4) MaskedSub(y Int32x4, z Mask32x4) Int32x4 // Asm: VPXORD, CPU Feature: AVX512EVEX func (x Int32x4) MaskedXor(y Int32x4, z Mask32x4) Int32x4 +// PairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x.
+// +// Asm: VPDPWSSD, CPU Feature: AVX_VNNI +func (x Int32x4) PairDotProdAccumulate(y Int32x4, z Int32x4) Int32x4 + +// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x with saturation. +// +// Asm: VPDPWSSDS, CPU Feature: AVX_VNNI +func (x Int32x4) SaturatedPairDotProdAccumulate(y Int32x4, z Int32x4) Int32x4 + +// SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation. +// +// Asm: VPDPBUSDS, CPU Feature: AVX_VNNI +func (x Int32x4) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint32x4, z Int32x4) Int32x4 + +// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// +// Asm: VPDPBUSD, CPU Feature: AVX_VNNI +func (x Int32x4) UnsignedSignedQuadDotProdAccumulate(y Uint32x4, z Int32x4) Int32x4 + // Add adds corresponding elements of two vectors. // // Asm: VPADDD, CPU Feature: AVX512EVEX @@ -4599,6 +4640,26 @@ func (x Int32x8) MaskedSub(y Int32x8, z Mask32x8) Int32x8 // Asm: VPXORD, CPU Feature: AVX512EVEX func (x Int32x8) MaskedXor(y Int32x8, z Mask32x8) Int32x8 +// PairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x. +// +// Asm: VPDPWSSD, CPU Feature: AVX_VNNI +func (x Int32x8) PairDotProdAccumulate(y Int32x8, z Int32x8) Int32x8 + +// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x with saturation. +// +// Asm: VPDPWSSDS, CPU Feature: AVX_VNNI +func (x Int32x8) SaturatedPairDotProdAccumulate(y Int32x8, z Int32x8) Int32x8 + +// SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation.
+// +// Asm: VPDPBUSDS, CPU Feature: AVX_VNNI +func (x Int32x8) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint32x8, z Int32x8) Int32x8 + +// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// +// Asm: VPDPBUSD, CPU Feature: AVX_VNNI +func (x Int32x8) UnsignedSignedQuadDotProdAccumulate(y Uint32x8, z Int32x8) Int32x8 + // Add adds corresponding elements of two vectors. // // Asm: VPADDQ, CPU Feature: AVX512EVEX @@ -5380,6 +5441,16 @@ func (x Uint32x16) MaskedSub(y Uint32x16, z Mask32x16) Uint32x16 // Asm: VPXORD, CPU Feature: AVX512EVEX func (x Uint32x16) MaskedXor(y Uint32x16, z Mask32x16) Uint32x16 +// SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation. +// +// Asm: VPDPBUSDS, CPU Feature: AVX512EVEX +func (x Uint32x16) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16) Uint32x16 + +// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// +// Asm: VPDPBUSD, CPU Feature: AVX512EVEX +func (x Uint32x16) UnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16) Uint32x16 + // Add adds corresponding elements of two vectors. // // Asm: VPADDD, CPU Feature: AVX512EVEX @@ -5456,6 +5527,16 @@ func (x Uint32x4) MaskedSub(y Uint32x4, z Mask32x4) Uint32x4 // Asm: VPXORD, CPU Feature: AVX512EVEX func (x Uint32x4) MaskedXor(y Uint32x4, z Mask32x4) Uint32x4 +// SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation. +// +// Asm: VPDPBUSDS, CPU Feature: AVX_VNNI +func (x Uint32x4) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint32x4, z Int32x4) Uint32x4 + +// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x.
+// +// Asm: VPDPBUSD, CPU Feature: AVX_VNNI +func (x Uint32x4) UnsignedSignedQuadDotProdAccumulate(y Uint32x4, z Int32x4) Uint32x4 + // Add adds corresponding elements of two vectors. // // Asm: VPADDD, CPU Feature: AVX512EVEX @@ -5532,6 +5613,16 @@ func (x Uint32x8) MaskedSub(y Uint32x8, z Mask32x8) Uint32x8 // Asm: VPXORD, CPU Feature: AVX512EVEX func (x Uint32x8) MaskedXor(y Uint32x8, z Mask32x8) Uint32x8 +// SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation. +// +// Asm: VPDPBUSDS, CPU Feature: AVX_VNNI +func (x Uint32x8) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint32x8, z Int32x8) Uint32x8 + +// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// +// Asm: VPDPBUSD, CPU Feature: AVX_VNNI +func (x Uint32x8) UnsignedSignedQuadDotProdAccumulate(y Uint32x8, z Int32x8) Uint32x8 + // Add adds corresponding elements of two vectors. // // Asm: VPADDQ, CPU Feature: AVX512EVEX @@ -5991,6 +6082,96 @@ func (x Uint8x64) MaskedSaturatedSub(y Uint8x64, z Mask8x64) Uint8x64 // Asm: VPSUBB, CPU Feature: AVX512EVEX func (x Uint8x64) MaskedSub(y Uint8x64, z Mask8x64) Uint8x64 +// MaskedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x. +// +// Asm: VPDPWSSD, CPU Feature: AVX512EVEX +func (x Int32x16) MaskedPairDotProdAccumulate(y Int16x32, z Int32x16, u Mask32x16) Int32x16 + +// MaskedSaturatedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x with saturation. +// +// Asm: VPDPWSSDS, CPU Feature: AVX512EVEX +func (x Int32x16) MaskedSaturatedPairDotProdAccumulate(y Int16x32, z Int32x16, u Mask32x16) Int32x16 + +// MaskedSaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation.
+// +// Asm: VPDPBUSDS, CPU Feature: AVX512EVEX +func (x Int32x16) MaskedSaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16, u Mask32x16) Int32x16 + +// MaskedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// +// Asm: VPDPBUSD, CPU Feature: AVX512EVEX +func (x Int32x16) MaskedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16, u Mask32x16) Int32x16 + +// MaskedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x. +// +// Asm: VPDPWSSD, CPU Feature: AVX512EVEX +func (x Int32x4) MaskedPairDotProdAccumulate(y Int16x8, z Int32x4, u Mask32x4) Int32x4 + +// MaskedSaturatedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x with saturation. +// +// Asm: VPDPWSSDS, CPU Feature: AVX512EVEX +func (x Int32x4) MaskedSaturatedPairDotProdAccumulate(y Int16x8, z Int32x4, u Mask32x4) Int32x4 + +// MaskedSaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation. +// +// Asm: VPDPBUSDS, CPU Feature: AVX512EVEX +func (x Int32x4) MaskedSaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int32x4, u Mask32x4) Int32x4 + +// MaskedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// +// Asm: VPDPBUSD, CPU Feature: AVX512EVEX +func (x Int32x4) MaskedUnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int32x4, u Mask32x4) Int32x4 + +// MaskedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x. +// +// Asm: VPDPWSSD, CPU Feature: AVX512EVEX +func (x Int32x8) MaskedPairDotProdAccumulate(y Int16x16, z Int32x8, u Mask32x8) Int32x8 + +// MaskedSaturatedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x with saturation.
+// +// Asm: VPDPWSSDS, CPU Feature: AVX512EVEX +func (x Int32x8) MaskedSaturatedPairDotProdAccumulate(y Int16x16, z Int32x8, u Mask32x8) Int32x8 + +// MaskedSaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation. +// +// Asm: VPDPBUSDS, CPU Feature: AVX512EVEX +func (x Int32x8) MaskedSaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int32x8, u Mask32x8) Int32x8 + +// MaskedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// +// Asm: VPDPBUSD, CPU Feature: AVX512EVEX +func (x Int32x8) MaskedUnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int32x8, u Mask32x8) Int32x8 + +// MaskedSaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation. +// +// Asm: VPDPBUSDS, CPU Feature: AVX512EVEX +func (x Uint32x16) MaskedSaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16, u Mask32x16) Uint32x16 + +// MaskedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// +// Asm: VPDPBUSD, CPU Feature: AVX512EVEX +func (x Uint32x16) MaskedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16, u Mask32x16) Uint32x16 + +// MaskedSaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation. +// +// Asm: VPDPBUSDS, CPU Feature: AVX512EVEX +func (x Uint32x4) MaskedSaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int32x4, u Mask32x4) Uint32x4 + +// MaskedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x.
+// +// Asm: VPDPBUSD, CPU Feature: AVX512EVEX +func (x Uint32x4) MaskedUnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int32x4, u Mask32x4) Uint32x4 + +// MaskedSaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x with saturation. +// +// Asm: VPDPBUSDS, CPU Feature: AVX512EVEX +func (x Uint32x8) MaskedSaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int32x8, u Mask32x8) Uint32x8 + +// MaskedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// +// Asm: VPDPBUSD, CPU Feature: AVX512EVEX +func (x Uint32x8) MaskedUnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int32x8, u Mask32x8) Uint32x8 + // CeilSuppressExceptionWithPrecision rounds elements up with specified precision, suppressing exceptions. // Const Immediate = 10. //