From 8eb5f6020e707672a846f0f83011b87e48039550 Mon Sep 17 00:00:00 2001 From: Junyang Shao Date: Thu, 7 Aug 2025 17:05:50 +0000 Subject: [PATCH] [dev.simd] cmd/compile, simd: API interface fixes - Absolute -> Abs - ApproximateReciprocal -> Reciprocal - Other derived apis also changed. - Round -> RoundToEven - Other derived apis also changed. - Drop DotProdBroadcast - Fused(Mul|Add)(Mul|Add)? -> remove the "Fused" - MulEvenWiden -> remove 64bit - MulLow -> Mul, add unit - PairDotProd -> DotProdPairs - make AddDotProdPairs machine ops only - peepholes will be in another CL at dev.simd. - PopCount -> OnesCount - Saturated* -> *Saturated - Fix (Add|Sub)Saturated uint mappings. - UnsignedSignedQuadDotProdAccumulate -> AddDotProdQuadruple - The "DotProdQuadruple" instruction does not exist, so no peepholes for this. This CL is generated by CL 694095. Change-Id: If4110cc04ab96240cf56f2348d35ed2a719687de Reviewed-on: https://go-review.googlesource.com/c/go/+/694115 Reviewed-by: David Chase LUCI-TryBot-Result: Go LUCI --- src/cmd/compile/internal/amd64/simdssa.go | 308 +- .../compile/internal/ssa/_gen/simdAMD64.rules | 493 ++- .../compile/internal/ssa/_gen/simdAMD64ops.go | 41 +- .../internal/ssa/_gen/simdgenericOps.go | 437 +- src/cmd/compile/internal/ssa/opGen.go | 2629 ++++++------ src/cmd/compile/internal/ssa/rewriteAMD64.go | 3513 ++++++++--------- .../compile/internal/ssagen/simdintrinsics.go | 437 +- src/simd/ops_amd64.go | 2387 ++++++----- src/simd/simd_test.go | 19 - src/simd/ternary_test.go | 12 +- src/simd/unary_test.go | 32 +- 11 files changed, 5064 insertions(+), 5244 deletions(-) diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index b778cd7994..274602c0a7 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -24,18 +24,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPABSQ128, ssa.OpAMD64VPABSQ256, ssa.OpAMD64VPABSQ512, - ssa.OpAMD64VRCPPS128, - ssa.OpAMD64VRCPPS256, - ssa.OpAMD64VRCP14PS512, - ssa.OpAMD64VRCP14PD128, - ssa.OpAMD64VRCP14PD256, - ssa.OpAMD64VRCP14PD512, - ssa.OpAMD64VRSQRTPS128, - ssa.OpAMD64VRSQRTPS256, - ssa.OpAMD64VRSQRT14PS512, - ssa.OpAMD64VRSQRT14PD128, - ssa.OpAMD64VRSQRT14PD256, - ssa.OpAMD64VRSQRT14PD512, ssa.OpAMD64VCVTTPS2DQ128, ssa.OpAMD64VCVTTPS2DQ256, ssa.OpAMD64VCVTTPS2DQ512, @@ -54,6 +42,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPOPCNTQ128, ssa.OpAMD64VPOPCNTQ256, ssa.OpAMD64VPOPCNTQ512, + ssa.OpAMD64VRCPPS128, + ssa.OpAMD64VRCPPS256, + ssa.OpAMD64VRCP14PS512, + ssa.OpAMD64VRCP14PD128, + ssa.OpAMD64VRCP14PD256, + ssa.OpAMD64VRCP14PD512, + ssa.OpAMD64VRSQRTPS128, + ssa.OpAMD64VRSQRTPS256, + ssa.OpAMD64VRSQRT14PS512, + ssa.OpAMD64VRSQRT14PD128, + ssa.OpAMD64VRSQRT14PD256, + ssa.OpAMD64VRSQRT14PD512, ssa.OpAMD64VSQRTPS128, ssa.OpAMD64VSQRTPS256, ssa.OpAMD64VSQRTPS512, @@ -96,6 +96,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPADDSW128, ssa.OpAMD64VPADDSW256, ssa.OpAMD64VPADDSW512, + ssa.OpAMD64VPADDUSB128, + ssa.OpAMD64VPADDUSB256, + ssa.OpAMD64VPADDUSB512, + ssa.OpAMD64VPADDUSW128, + ssa.OpAMD64VPADDUSW256, + ssa.OpAMD64VPADDUSW512, ssa.OpAMD64VADDSUBPS128, ssa.OpAMD64VADDSUBPS256, ssa.OpAMD64VADDSUBPD128, @@ -114,12 +120,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPAVGW128, ssa.OpAMD64VPAVGW256, ssa.OpAMD64VPAVGW512, + ssa.OpAMD64VPSIGNB128, + ssa.OpAMD64VPSIGNB256, + ssa.OpAMD64VPSIGNW128, + ssa.OpAMD64VPSIGNW256, + 
ssa.OpAMD64VPSIGND128, + ssa.OpAMD64VPSIGND256, ssa.OpAMD64VDIVPS128, ssa.OpAMD64VDIVPS256, ssa.OpAMD64VDIVPS512, ssa.OpAMD64VDIVPD128, ssa.OpAMD64VDIVPD256, ssa.OpAMD64VDIVPD512, + ssa.OpAMD64VPMADDWD128, + ssa.OpAMD64VPMADDWD256, + ssa.OpAMD64VPMADDWD512, + ssa.OpAMD64VPMADDUBSW128, + ssa.OpAMD64VPMADDUBSW256, + ssa.OpAMD64VPMADDUBSW512, ssa.OpAMD64VPCMPEQB128, ssa.OpAMD64VPCMPEQB256, ssa.OpAMD64VPCMPEQW128, @@ -216,23 +234,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMULLQ512, ssa.OpAMD64VPMULDQ128, ssa.OpAMD64VPMULDQ256, - ssa.OpAMD64VPMULDQ512, ssa.OpAMD64VPMULUDQ128, ssa.OpAMD64VPMULUDQ256, - ssa.OpAMD64VPMULUDQ512, - ssa.OpAMD64VPMULHW128, - ssa.OpAMD64VPMULHW256, - ssa.OpAMD64VPMULHW512, ssa.OpAMD64VPMULHUW128, ssa.OpAMD64VPMULHUW256, - ssa.OpAMD64VPMULHUW512, + ssa.OpAMD64VPMULHW512, ssa.OpAMD64VPOR128, ssa.OpAMD64VPOR256, ssa.OpAMD64VPORD512, ssa.OpAMD64VPORQ512, - ssa.OpAMD64VPMADDWD128, - ssa.OpAMD64VPMADDWD256, - ssa.OpAMD64VPMADDWD512, ssa.OpAMD64VPERMB128, ssa.OpAMD64VPERMB256, ssa.OpAMD64VPERMB512, @@ -259,9 +269,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPRORVQ128, ssa.OpAMD64VPRORVQ256, ssa.OpAMD64VPRORVQ512, - ssa.OpAMD64VPMADDUBSW128, - ssa.OpAMD64VPMADDUBSW256, - ssa.OpAMD64VPMADDUBSW512, ssa.OpAMD64VSCALEFPS128, ssa.OpAMD64VSCALEFPS256, ssa.OpAMD64VSCALEFPS512, @@ -295,12 +302,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPSRLVQ128, ssa.OpAMD64VPSRLVQ256, ssa.OpAMD64VPSRLVQ512, - ssa.OpAMD64VPSIGNB128, - ssa.OpAMD64VPSIGNB256, - ssa.OpAMD64VPSIGNW128, - ssa.OpAMD64VPSIGNW256, - ssa.OpAMD64VPSIGND128, - ssa.OpAMD64VPSIGND256, ssa.OpAMD64VSUBPS128, ssa.OpAMD64VSUBPS256, ssa.OpAMD64VSUBPS512, @@ -335,6 +336,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPSUBSW128, ssa.OpAMD64VPSUBSW256, ssa.OpAMD64VPSUBSW512, + ssa.OpAMD64VPSUBUSB128, + ssa.OpAMD64VPSUBUSB256, + ssa.OpAMD64VPSUBUSB512, + ssa.OpAMD64VPSUBUSW128, + ssa.OpAMD64VPSUBUSW256, + ssa.OpAMD64VPSUBUSW512, ssa.OpAMD64VPXOR128, ssa.OpAMD64VPXOR256, ssa.OpAMD64VPXORD512, @@ -375,6 +382,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPADDSWMasked128, ssa.OpAMD64VPADDSWMasked256, ssa.OpAMD64VPADDSWMasked512, + ssa.OpAMD64VPADDUSBMasked128, + ssa.OpAMD64VPADDUSBMasked256, + ssa.OpAMD64VPADDUSBMasked512, + ssa.OpAMD64VPADDUSWMasked128, + ssa.OpAMD64VPADDUSWMasked256, + ssa.OpAMD64VPADDUSWMasked512, ssa.OpAMD64VPANDDMasked128, ssa.OpAMD64VPANDDMasked256, ssa.OpAMD64VPANDDMasked512, @@ -399,6 +412,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VDIVPDMasked128, ssa.OpAMD64VDIVPDMasked256, ssa.OpAMD64VDIVPDMasked512, + ssa.OpAMD64VPMADDWDMasked128, + ssa.OpAMD64VPMADDWDMasked256, + ssa.OpAMD64VPMADDWDMasked512, + ssa.OpAMD64VPMADDUBSWMasked128, + ssa.OpAMD64VPMADDUBSWMasked256, + ssa.OpAMD64VPMADDUBSWMasked512, ssa.OpAMD64VGF2P8MULBMasked128, ssa.OpAMD64VGF2P8MULBMasked256, ssa.OpAMD64VGF2P8MULBMasked512, @@ -462,17 +481,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMINUQMasked128, ssa.OpAMD64VPMINUQMasked256, ssa.OpAMD64VPMINUQMasked512, - ssa.OpAMD64VPMULDQMasked128, - ssa.OpAMD64VPMULDQMasked256, - ssa.OpAMD64VPMULDQMasked512, - ssa.OpAMD64VPMULUDQMasked128, - ssa.OpAMD64VPMULUDQMasked256, - ssa.OpAMD64VPMULUDQMasked512, - ssa.OpAMD64VPMULHWMasked128, - ssa.OpAMD64VPMULHWMasked256, - ssa.OpAMD64VPMULHWMasked512, ssa.OpAMD64VPMULHUWMasked128, - ssa.OpAMD64VPMULHUWMasked256, + ssa.OpAMD64VPMULHWMasked256, 
ssa.OpAMD64VPMULHUWMasked512, ssa.OpAMD64VMULPSMasked128, ssa.OpAMD64VMULPSMasked256, @@ -495,9 +505,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPORQMasked128, ssa.OpAMD64VPORQMasked256, ssa.OpAMD64VPORQMasked512, - ssa.OpAMD64VPMADDWDMasked128, - ssa.OpAMD64VPMADDWDMasked256, - ssa.OpAMD64VPMADDWDMasked512, ssa.OpAMD64VPERMBMasked128, ssa.OpAMD64VPERMBMasked256, ssa.OpAMD64VPERMBMasked512, @@ -524,9 +531,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPRORVQMasked128, ssa.OpAMD64VPRORVQMasked256, ssa.OpAMD64VPRORVQMasked512, - ssa.OpAMD64VPMADDUBSWMasked128, - ssa.OpAMD64VPMADDUBSWMasked256, - ssa.OpAMD64VPMADDUBSWMasked512, ssa.OpAMD64VSCALEFPSMasked128, ssa.OpAMD64VSCALEFPSMasked256, ssa.OpAMD64VSCALEFPSMasked512, @@ -584,6 +588,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPSUBSWMasked128, ssa.OpAMD64VPSUBSWMasked256, ssa.OpAMD64VPSUBSWMasked512, + ssa.OpAMD64VPSUBUSBMasked128, + ssa.OpAMD64VPSUBUSBMasked256, + ssa.OpAMD64VPSUBUSBMasked512, + ssa.OpAMD64VPSUBUSWMasked128, + ssa.OpAMD64VPSUBUSWMasked256, + ssa.OpAMD64VPSUBUSWMasked512, ssa.OpAMD64VPXORDMasked128, ssa.OpAMD64VPXORDMasked256, ssa.OpAMD64VPXORDMasked512, @@ -608,18 +618,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPABSQMasked128, ssa.OpAMD64VPABSQMasked256, ssa.OpAMD64VPABSQMasked512, - ssa.OpAMD64VRCP14PSMasked128, - ssa.OpAMD64VRCP14PSMasked256, - ssa.OpAMD64VRCP14PSMasked512, - ssa.OpAMD64VRCP14PDMasked128, - ssa.OpAMD64VRCP14PDMasked256, - ssa.OpAMD64VRCP14PDMasked512, - ssa.OpAMD64VRSQRT14PSMasked128, - ssa.OpAMD64VRSQRT14PSMasked256, - ssa.OpAMD64VRSQRT14PSMasked512, - ssa.OpAMD64VRSQRT14PDMasked128, - ssa.OpAMD64VRSQRT14PDMasked256, - ssa.OpAMD64VRSQRT14PDMasked512, ssa.OpAMD64VCOMPRESSPSMasked128, ssa.OpAMD64VCOMPRESSPSMasked256, ssa.OpAMD64VCOMPRESSPSMasked512, @@ -674,6 +672,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPOPCNTQMasked128, ssa.OpAMD64VPOPCNTQMasked256, ssa.OpAMD64VPOPCNTQMasked512, + ssa.OpAMD64VRCP14PSMasked128, + ssa.OpAMD64VRCP14PSMasked256, + ssa.OpAMD64VRCP14PSMasked512, + ssa.OpAMD64VRCP14PDMasked128, + ssa.OpAMD64VRCP14PDMasked256, + ssa.OpAMD64VRCP14PDMasked512, + ssa.OpAMD64VRSQRT14PSMasked128, + ssa.OpAMD64VRSQRT14PSMasked256, + ssa.OpAMD64VRSQRT14PSMasked512, + ssa.OpAMD64VRSQRT14PDMasked128, + ssa.OpAMD64VRSQRT14PDMasked256, + ssa.OpAMD64VRSQRT14PDMasked512, ssa.OpAMD64VSQRTPSMasked128, ssa.OpAMD64VSQRTPSMasked256, ssa.OpAMD64VSQRTPSMasked512, @@ -800,10 +810,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPSRAQMasked512const: p = simdVkvImm8(s, v) - case ssa.OpAMD64VDPPS128, - ssa.OpAMD64VDPPS256, - ssa.OpAMD64VDPPD128, - ssa.OpAMD64VCMPPS128, + case ssa.OpAMD64VCMPPS128, ssa.OpAMD64VCMPPS256, ssa.OpAMD64VCMPPD128, ssa.OpAMD64VCMPPD256, @@ -900,6 +907,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { case ssa.OpAMD64VPDPWSSD128, ssa.OpAMD64VPDPWSSD256, ssa.OpAMD64VPDPWSSD512, + ssa.OpAMD64VPDPWSSDS128, + ssa.OpAMD64VPDPWSSDS256, + ssa.OpAMD64VPDPWSSDS512, + ssa.OpAMD64VPDPBUSD128, + ssa.OpAMD64VPDPBUSD256, + ssa.OpAMD64VPDPBUSD512, + ssa.OpAMD64VPDPBUSDS128, + ssa.OpAMD64VPDPBUSDS256, + ssa.OpAMD64VPDPBUSDS512, ssa.OpAMD64VFMADD213PS128, ssa.OpAMD64VFMADD213PS256, ssa.OpAMD64VFMADD213PS512, @@ -936,12 +952,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPERMI2Q256, ssa.OpAMD64VPERMI2PD512, ssa.OpAMD64VPERMI2Q512, - ssa.OpAMD64VPDPWSSDS128, - ssa.OpAMD64VPDPWSSDS256, 
- ssa.OpAMD64VPDPWSSDS512, - ssa.OpAMD64VPDPBUSDS128, - ssa.OpAMD64VPDPBUSDS256, - ssa.OpAMD64VPDPBUSDS512, ssa.OpAMD64VPSHLDVW128, ssa.OpAMD64VPSHLDVW256, ssa.OpAMD64VPSHLDVW512, @@ -959,15 +969,21 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPSHRDVD512, ssa.OpAMD64VPSHRDVQ128, ssa.OpAMD64VPSHRDVQ256, - ssa.OpAMD64VPSHRDVQ512, - ssa.OpAMD64VPDPBUSD128, - ssa.OpAMD64VPDPBUSD256, - ssa.OpAMD64VPDPBUSD512: + ssa.OpAMD64VPSHRDVQ512: p = simdV31ResultInArg0(s, v) case ssa.OpAMD64VPDPWSSDMasked128, ssa.OpAMD64VPDPWSSDMasked256, ssa.OpAMD64VPDPWSSDMasked512, + ssa.OpAMD64VPDPWSSDSMasked128, + ssa.OpAMD64VPDPWSSDSMasked256, + ssa.OpAMD64VPDPWSSDSMasked512, + ssa.OpAMD64VPDPBUSDMasked128, + ssa.OpAMD64VPDPBUSDMasked256, + ssa.OpAMD64VPDPBUSDMasked512, + ssa.OpAMD64VPDPBUSDSMasked128, + ssa.OpAMD64VPDPBUSDSMasked256, + ssa.OpAMD64VPDPBUSDSMasked512, ssa.OpAMD64VFMADD213PSMasked128, ssa.OpAMD64VFMADD213PSMasked256, ssa.OpAMD64VFMADD213PSMasked512, @@ -1004,12 +1020,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPERMI2QMasked256, ssa.OpAMD64VPERMI2PDMasked512, ssa.OpAMD64VPERMI2QMasked512, - ssa.OpAMD64VPDPWSSDSMasked128, - ssa.OpAMD64VPDPWSSDSMasked256, - ssa.OpAMD64VPDPWSSDSMasked512, - ssa.OpAMD64VPDPBUSDSMasked128, - ssa.OpAMD64VPDPBUSDSMasked256, - ssa.OpAMD64VPDPBUSDSMasked512, ssa.OpAMD64VPSHLDVWMasked128, ssa.OpAMD64VPSHLDVWMasked256, ssa.OpAMD64VPSHLDVWMasked512, @@ -1027,10 +1037,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPSHRDVDMasked512, ssa.OpAMD64VPSHRDVQMasked128, ssa.OpAMD64VPSHRDVQMasked256, - ssa.OpAMD64VPSHRDVQMasked512, - ssa.OpAMD64VPDPBUSDMasked128, - ssa.OpAMD64VPDPBUSDMasked256, - ssa.OpAMD64VPDPBUSDMasked512: + ssa.OpAMD64VPSHRDVQMasked512: p = simdV3kvResultInArg0(s, v) case ssa.OpAMD64VPSLLW128, @@ -1151,6 +1158,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPDPWSSDMasked128, ssa.OpAMD64VPDPWSSDMasked256, ssa.OpAMD64VPDPWSSDMasked512, + ssa.OpAMD64VPDPWSSDSMasked128, + ssa.OpAMD64VPDPWSSDSMasked256, + ssa.OpAMD64VPDPWSSDSMasked512, + ssa.OpAMD64VPDPBUSDMasked128, + ssa.OpAMD64VPDPBUSDMasked256, + ssa.OpAMD64VPDPBUSDMasked512, + ssa.OpAMD64VPDPBUSDSMasked128, + ssa.OpAMD64VPDPBUSDSMasked256, + ssa.OpAMD64VPDPBUSDSMasked512, ssa.OpAMD64VADDPSMasked128, ssa.OpAMD64VADDPSMasked256, ssa.OpAMD64VADDPSMasked512, @@ -1175,6 +1191,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPADDSWMasked128, ssa.OpAMD64VPADDSWMasked256, ssa.OpAMD64VPADDSWMasked512, + ssa.OpAMD64VPADDUSBMasked128, + ssa.OpAMD64VPADDUSBMasked256, + ssa.OpAMD64VPADDUSBMasked512, + ssa.OpAMD64VPADDUSWMasked128, + ssa.OpAMD64VPADDUSWMasked256, + ssa.OpAMD64VPADDUSWMasked512, ssa.OpAMD64VPANDDMasked128, ssa.OpAMD64VPANDDMasked256, ssa.OpAMD64VPANDDMasked512, @@ -1187,18 +1209,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPANDNQMasked128, ssa.OpAMD64VPANDNQMasked256, ssa.OpAMD64VPANDNQMasked512, - ssa.OpAMD64VRCP14PSMasked128, - ssa.OpAMD64VRCP14PSMasked256, - ssa.OpAMD64VRCP14PSMasked512, - ssa.OpAMD64VRCP14PDMasked128, - ssa.OpAMD64VRCP14PDMasked256, - ssa.OpAMD64VRCP14PDMasked512, - ssa.OpAMD64VRSQRT14PSMasked128, - ssa.OpAMD64VRSQRT14PSMasked256, - ssa.OpAMD64VRSQRT14PSMasked512, - ssa.OpAMD64VRSQRT14PDMasked128, - ssa.OpAMD64VRSQRT14PDMasked256, - ssa.OpAMD64VRSQRT14PDMasked512, ssa.OpAMD64VPAVGBMasked128, ssa.OpAMD64VPAVGBMasked256, ssa.OpAMD64VPAVGBMasked512, @@ -1247,6 +1257,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) 
bool { ssa.OpAMD64VDIVPDMasked128, ssa.OpAMD64VDIVPDMasked256, ssa.OpAMD64VDIVPDMasked512, + ssa.OpAMD64VPMADDWDMasked128, + ssa.OpAMD64VPMADDWDMasked256, + ssa.OpAMD64VPMADDWDMasked512, + ssa.OpAMD64VPMADDUBSWMasked128, + ssa.OpAMD64VPMADDUBSWMasked256, + ssa.OpAMD64VPMADDUBSWMasked512, ssa.OpAMD64VEXPANDPSMasked128, ssa.OpAMD64VEXPANDPSMasked256, ssa.OpAMD64VEXPANDPSMasked512, @@ -1265,24 +1281,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPEXPANDQMasked128, ssa.OpAMD64VPEXPANDQMasked256, ssa.OpAMD64VPEXPANDQMasked512, - ssa.OpAMD64VFMADD213PSMasked128, - ssa.OpAMD64VFMADD213PSMasked256, - ssa.OpAMD64VFMADD213PSMasked512, - ssa.OpAMD64VFMADD213PDMasked128, - ssa.OpAMD64VFMADD213PDMasked256, - ssa.OpAMD64VFMADD213PDMasked512, - ssa.OpAMD64VFMADDSUB213PSMasked128, - ssa.OpAMD64VFMADDSUB213PSMasked256, - ssa.OpAMD64VFMADDSUB213PSMasked512, - ssa.OpAMD64VFMADDSUB213PDMasked128, - ssa.OpAMD64VFMADDSUB213PDMasked256, - ssa.OpAMD64VFMADDSUB213PDMasked512, - ssa.OpAMD64VFMSUBADD213PSMasked128, - ssa.OpAMD64VFMSUBADD213PSMasked256, - ssa.OpAMD64VFMSUBADD213PSMasked512, - ssa.OpAMD64VFMSUBADD213PDMasked128, - ssa.OpAMD64VFMSUBADD213PDMasked256, - ssa.OpAMD64VFMSUBADD213PDMasked512, ssa.OpAMD64VGF2P8AFFINEINVQBMasked128, ssa.OpAMD64VGF2P8AFFINEINVQBMasked256, ssa.OpAMD64VGF2P8AFFINEINVQBMasked512, @@ -1352,17 +1350,20 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMINUQMasked128, ssa.OpAMD64VPMINUQMasked256, ssa.OpAMD64VPMINUQMasked512, - ssa.OpAMD64VPMULDQMasked128, - ssa.OpAMD64VPMULDQMasked256, - ssa.OpAMD64VPMULDQMasked512, - ssa.OpAMD64VPMULUDQMasked128, - ssa.OpAMD64VPMULUDQMasked256, - ssa.OpAMD64VPMULUDQMasked512, - ssa.OpAMD64VPMULHWMasked128, - ssa.OpAMD64VPMULHWMasked256, - ssa.OpAMD64VPMULHWMasked512, + ssa.OpAMD64VFMADD213PSMasked128, + ssa.OpAMD64VFMADD213PSMasked256, + ssa.OpAMD64VFMADD213PSMasked512, + ssa.OpAMD64VFMADD213PDMasked128, + ssa.OpAMD64VFMADD213PDMasked256, + ssa.OpAMD64VFMADD213PDMasked512, + ssa.OpAMD64VFMADDSUB213PSMasked128, + ssa.OpAMD64VFMADDSUB213PSMasked256, + ssa.OpAMD64VFMADDSUB213PSMasked512, + ssa.OpAMD64VFMADDSUB213PDMasked128, + ssa.OpAMD64VFMADDSUB213PDMasked256, + ssa.OpAMD64VFMADDSUB213PDMasked512, ssa.OpAMD64VPMULHUWMasked128, - ssa.OpAMD64VPMULHUWMasked256, + ssa.OpAMD64VPMULHWMasked256, ssa.OpAMD64VPMULHUWMasked512, ssa.OpAMD64VMULPSMasked128, ssa.OpAMD64VMULPSMasked256, @@ -1379,15 +1380,30 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMULLQMasked128, ssa.OpAMD64VPMULLQMasked256, ssa.OpAMD64VPMULLQMasked512, + ssa.OpAMD64VFMSUBADD213PSMasked128, + ssa.OpAMD64VFMSUBADD213PSMasked256, + ssa.OpAMD64VFMSUBADD213PSMasked512, + ssa.OpAMD64VFMSUBADD213PDMasked128, + ssa.OpAMD64VFMSUBADD213PDMasked256, + ssa.OpAMD64VFMSUBADD213PDMasked512, + ssa.OpAMD64VPOPCNTBMasked128, + ssa.OpAMD64VPOPCNTBMasked256, + ssa.OpAMD64VPOPCNTBMasked512, + ssa.OpAMD64VPOPCNTWMasked128, + ssa.OpAMD64VPOPCNTWMasked256, + ssa.OpAMD64VPOPCNTWMasked512, + ssa.OpAMD64VPOPCNTDMasked128, + ssa.OpAMD64VPOPCNTDMasked256, + ssa.OpAMD64VPOPCNTDMasked512, + ssa.OpAMD64VPOPCNTQMasked128, + ssa.OpAMD64VPOPCNTQMasked256, + ssa.OpAMD64VPOPCNTQMasked512, ssa.OpAMD64VPORDMasked128, ssa.OpAMD64VPORDMasked256, ssa.OpAMD64VPORDMasked512, ssa.OpAMD64VPORQMasked128, ssa.OpAMD64VPORQMasked256, ssa.OpAMD64VPORQMasked512, - ssa.OpAMD64VPMADDWDMasked128, - ssa.OpAMD64VPMADDWDMasked256, - ssa.OpAMD64VPMADDWDMasked512, ssa.OpAMD64VPERMI2BMasked128, ssa.OpAMD64VPERMI2BMasked256, ssa.OpAMD64VPERMI2BMasked512, @@ -1420,18 
+1436,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPERMQMasked256, ssa.OpAMD64VPERMPDMasked512, ssa.OpAMD64VPERMQMasked512, - ssa.OpAMD64VPOPCNTBMasked128, - ssa.OpAMD64VPOPCNTBMasked256, - ssa.OpAMD64VPOPCNTBMasked512, - ssa.OpAMD64VPOPCNTWMasked128, - ssa.OpAMD64VPOPCNTWMasked256, - ssa.OpAMD64VPOPCNTWMasked512, - ssa.OpAMD64VPOPCNTDMasked128, - ssa.OpAMD64VPOPCNTDMasked256, - ssa.OpAMD64VPOPCNTDMasked512, - ssa.OpAMD64VPOPCNTQMasked128, - ssa.OpAMD64VPOPCNTQMasked256, - ssa.OpAMD64VPOPCNTQMasked512, + ssa.OpAMD64VRCP14PSMasked128, + ssa.OpAMD64VRCP14PSMasked256, + ssa.OpAMD64VRCP14PSMasked512, + ssa.OpAMD64VRCP14PDMasked128, + ssa.OpAMD64VRCP14PDMasked256, + ssa.OpAMD64VRCP14PDMasked512, + ssa.OpAMD64VRSQRT14PSMasked128, + ssa.OpAMD64VRSQRT14PSMasked256, + ssa.OpAMD64VRSQRT14PSMasked512, + ssa.OpAMD64VRSQRT14PDMasked128, + ssa.OpAMD64VRSQRT14PDMasked256, + ssa.OpAMD64VRSQRT14PDMasked512, ssa.OpAMD64VPROLDMasked128, ssa.OpAMD64VPROLDMasked256, ssa.OpAMD64VPROLDMasked512, @@ -1456,15 +1472,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPRORVQMasked128, ssa.OpAMD64VPRORVQMasked256, ssa.OpAMD64VPRORVQMasked512, - ssa.OpAMD64VPDPWSSDSMasked128, - ssa.OpAMD64VPDPWSSDSMasked256, - ssa.OpAMD64VPDPWSSDSMasked512, - ssa.OpAMD64VPMADDUBSWMasked128, - ssa.OpAMD64VPMADDUBSWMasked256, - ssa.OpAMD64VPMADDUBSWMasked512, - ssa.OpAMD64VPDPBUSDSMasked128, - ssa.OpAMD64VPDPBUSDSMasked256, - ssa.OpAMD64VPDPBUSDSMasked512, ssa.OpAMD64VSCALEFPSMasked128, ssa.OpAMD64VSCALEFPSMasked256, ssa.OpAMD64VSCALEFPSMasked512, @@ -1591,9 +1598,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPSUBSWMasked128, ssa.OpAMD64VPSUBSWMasked256, ssa.OpAMD64VPSUBSWMasked512, - ssa.OpAMD64VPDPBUSDMasked128, - ssa.OpAMD64VPDPBUSDMasked256, - ssa.OpAMD64VPDPBUSDMasked512, + ssa.OpAMD64VPSUBUSBMasked128, + ssa.OpAMD64VPSUBUSBMasked256, + ssa.OpAMD64VPSUBUSBMasked512, + ssa.OpAMD64VPSUBUSWMasked128, + ssa.OpAMD64VPSUBUSWMasked256, + ssa.OpAMD64VPSUBUSWMasked512, ssa.OpAMD64VPXORDMasked128, ssa.OpAMD64VPXORDMasked256, ssa.OpAMD64VPXORDMasked512, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index ae29a9117e..e294836cd2 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -1,29 +1,29 @@ // Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT. -(AbsoluteInt8x16 ...) => (VPABSB128 ...) -(AbsoluteInt8x32 ...) => (VPABSB256 ...) -(AbsoluteInt8x64 ...) => (VPABSB512 ...) -(AbsoluteInt16x8 ...) => (VPABSW128 ...) -(AbsoluteInt16x16 ...) => (VPABSW256 ...) -(AbsoluteInt16x32 ...) => (VPABSW512 ...) -(AbsoluteInt32x4 ...) => (VPABSD128 ...) -(AbsoluteInt32x8 ...) => (VPABSD256 ...) -(AbsoluteInt32x16 ...) => (VPABSD512 ...) -(AbsoluteInt64x2 ...) => (VPABSQ128 ...) -(AbsoluteInt64x4 ...) => (VPABSQ256 ...) -(AbsoluteInt64x8 ...) => (VPABSQ512 ...) 
-(AbsoluteMaskedInt8x16 x mask) => (VPABSBMasked128 x (VPMOVVec8x16ToM mask)) -(AbsoluteMaskedInt8x32 x mask) => (VPABSBMasked256 x (VPMOVVec8x32ToM mask)) -(AbsoluteMaskedInt8x64 x mask) => (VPABSBMasked512 x (VPMOVVec8x64ToM mask)) -(AbsoluteMaskedInt16x8 x mask) => (VPABSWMasked128 x (VPMOVVec16x8ToM mask)) -(AbsoluteMaskedInt16x16 x mask) => (VPABSWMasked256 x (VPMOVVec16x16ToM mask)) -(AbsoluteMaskedInt16x32 x mask) => (VPABSWMasked512 x (VPMOVVec16x32ToM mask)) -(AbsoluteMaskedInt32x4 x mask) => (VPABSDMasked128 x (VPMOVVec32x4ToM mask)) -(AbsoluteMaskedInt32x8 x mask) => (VPABSDMasked256 x (VPMOVVec32x8ToM mask)) -(AbsoluteMaskedInt32x16 x mask) => (VPABSDMasked512 x (VPMOVVec32x16ToM mask)) -(AbsoluteMaskedInt64x2 x mask) => (VPABSQMasked128 x (VPMOVVec64x2ToM mask)) -(AbsoluteMaskedInt64x4 x mask) => (VPABSQMasked256 x (VPMOVVec64x4ToM mask)) -(AbsoluteMaskedInt64x8 x mask) => (VPABSQMasked512 x (VPMOVVec64x8ToM mask)) +(AbsInt8x16 ...) => (VPABSB128 ...) +(AbsInt8x32 ...) => (VPABSB256 ...) +(AbsInt8x64 ...) => (VPABSB512 ...) +(AbsInt16x8 ...) => (VPABSW128 ...) +(AbsInt16x16 ...) => (VPABSW256 ...) +(AbsInt16x32 ...) => (VPABSW512 ...) +(AbsInt32x4 ...) => (VPABSD128 ...) +(AbsInt32x8 ...) => (VPABSD256 ...) +(AbsInt32x16 ...) => (VPABSD512 ...) +(AbsInt64x2 ...) => (VPABSQ128 ...) +(AbsInt64x4 ...) => (VPABSQ256 ...) +(AbsInt64x8 ...) => (VPABSQ512 ...) +(AbsMaskedInt8x16 x mask) => (VPABSBMasked128 x (VPMOVVec8x16ToM mask)) +(AbsMaskedInt8x32 x mask) => (VPABSBMasked256 x (VPMOVVec8x32ToM mask)) +(AbsMaskedInt8x64 x mask) => (VPABSBMasked512 x (VPMOVVec8x64ToM mask)) +(AbsMaskedInt16x8 x mask) => (VPABSWMasked128 x (VPMOVVec16x8ToM mask)) +(AbsMaskedInt16x16 x mask) => (VPABSWMasked256 x (VPMOVVec16x16ToM mask)) +(AbsMaskedInt16x32 x mask) => (VPABSWMasked512 x (VPMOVVec16x32ToM mask)) +(AbsMaskedInt32x4 x mask) => (VPABSDMasked128 x (VPMOVVec32x4ToM mask)) +(AbsMaskedInt32x8 x mask) => (VPABSDMasked256 x (VPMOVVec32x8ToM mask)) +(AbsMaskedInt32x16 x mask) => (VPABSDMasked512 x (VPMOVVec32x16ToM mask)) +(AbsMaskedInt64x2 x mask) => (VPABSQMasked128 x (VPMOVVec64x2ToM mask)) +(AbsMaskedInt64x4 x mask) => (VPABSQMasked256 x (VPMOVVec64x4ToM mask)) +(AbsMaskedInt64x8 x mask) => (VPABSQMasked512 x (VPMOVVec64x8ToM mask)) (AddFloat32x4 ...) => (VADDPS128 ...) (AddFloat32x8 ...) => (VADDPS256 ...) (AddFloat32x16 ...) => (VADDPS512 ...) @@ -54,12 +54,24 @@ (AddUint64x2 ...) => (VPADDQ128 ...) (AddUint64x4 ...) => (VPADDQ256 ...) (AddUint64x8 ...) => (VPADDQ512 ...) -(AddDotProdInt32x4 ...) => (VPDPWSSD128 ...) -(AddDotProdInt32x8 ...) => (VPDPWSSD256 ...) -(AddDotProdInt32x16 ...) => (VPDPWSSD512 ...) -(AddDotProdMaskedInt32x4 x y z mask) => (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM mask)) -(AddDotProdMaskedInt32x8 x y z mask) => (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM mask)) -(AddDotProdMaskedInt32x16 x y z mask) => (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM mask)) +(AddDotProdPairsSaturatedInt32x4 ...) => (VPDPWSSDS128 ...) +(AddDotProdPairsSaturatedInt32x8 ...) => (VPDPWSSDS256 ...) +(AddDotProdPairsSaturatedInt32x16 ...) => (VPDPWSSDS512 ...) +(AddDotProdPairsSaturatedMaskedInt32x4 x y z mask) => (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM mask)) +(AddDotProdPairsSaturatedMaskedInt32x8 x y z mask) => (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM mask)) +(AddDotProdPairsSaturatedMaskedInt32x16 x y z mask) => (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM mask)) +(AddDotProdQuadrupleInt32x4 ...) => (VPDPBUSD128 ...) +(AddDotProdQuadrupleInt32x8 ...) => (VPDPBUSD256 ...) 
+(AddDotProdQuadrupleInt32x16 ...) => (VPDPBUSD512 ...) +(AddDotProdQuadrupleMaskedInt32x4 x y z mask) => (VPDPBUSDMasked128 x y z (VPMOVVec32x4ToM mask)) +(AddDotProdQuadrupleMaskedInt32x8 x y z mask) => (VPDPBUSDMasked256 x y z (VPMOVVec32x8ToM mask)) +(AddDotProdQuadrupleMaskedInt32x16 x y z mask) => (VPDPBUSDMasked512 x y z (VPMOVVec32x16ToM mask)) +(AddDotProdQuadrupleSaturatedInt32x4 ...) => (VPDPBUSDS128 ...) +(AddDotProdQuadrupleSaturatedInt32x8 ...) => (VPDPBUSDS256 ...) +(AddDotProdQuadrupleSaturatedInt32x16 ...) => (VPDPBUSDS512 ...) +(AddDotProdQuadrupleSaturatedMaskedInt32x4 x y z mask) => (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM mask)) +(AddDotProdQuadrupleSaturatedMaskedInt32x8 x y z mask) => (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM mask)) +(AddDotProdQuadrupleSaturatedMaskedInt32x16 x y z mask) => (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM mask)) (AddMaskedFloat32x4 x y mask) => (VADDPSMasked128 x y (VPMOVVec32x4ToM mask)) (AddMaskedFloat32x8 x y mask) => (VADDPSMasked256 x y (VPMOVVec32x8ToM mask)) (AddMaskedFloat32x16 x y mask) => (VADDPSMasked512 x y (VPMOVVec32x16ToM mask)) @@ -110,24 +122,24 @@ (AddSaturatedInt16x8 ...) => (VPADDSW128 ...) (AddSaturatedInt16x16 ...) => (VPADDSW256 ...) (AddSaturatedInt16x32 ...) => (VPADDSW512 ...) -(AddSaturatedUint8x16 ...) => (VPADDSB128 ...) -(AddSaturatedUint8x32 ...) => (VPADDSB256 ...) -(AddSaturatedUint8x64 ...) => (VPADDSB512 ...) -(AddSaturatedUint16x8 ...) => (VPADDSW128 ...) -(AddSaturatedUint16x16 ...) => (VPADDSW256 ...) -(AddSaturatedUint16x32 ...) => (VPADDSW512 ...) +(AddSaturatedUint8x16 ...) => (VPADDUSB128 ...) +(AddSaturatedUint8x32 ...) => (VPADDUSB256 ...) +(AddSaturatedUint8x64 ...) => (VPADDUSB512 ...) +(AddSaturatedUint16x8 ...) => (VPADDUSW128 ...) +(AddSaturatedUint16x16 ...) => (VPADDUSW256 ...) +(AddSaturatedUint16x32 ...) => (VPADDUSW512 ...) (AddSaturatedMaskedInt8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask)) (AddSaturatedMaskedInt8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask)) (AddSaturatedMaskedInt8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask)) (AddSaturatedMaskedInt16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask)) (AddSaturatedMaskedInt16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask)) (AddSaturatedMaskedInt16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask)) -(AddSaturatedMaskedUint8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask)) -(AddSaturatedMaskedUint8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask)) -(AddSaturatedMaskedUint8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask)) -(AddSaturatedMaskedUint16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask)) -(AddSaturatedMaskedUint16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask)) -(AddSaturatedMaskedUint16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask)) +(AddSaturatedMaskedUint8x16 x y mask) => (VPADDUSBMasked128 x y (VPMOVVec8x16ToM mask)) +(AddSaturatedMaskedUint8x32 x y mask) => (VPADDUSBMasked256 x y (VPMOVVec8x32ToM mask)) +(AddSaturatedMaskedUint8x64 x y mask) => (VPADDUSBMasked512 x y (VPMOVVec8x64ToM mask)) +(AddSaturatedMaskedUint16x8 x y mask) => (VPADDUSWMasked128 x y (VPMOVVec16x8ToM mask)) +(AddSaturatedMaskedUint16x16 x y mask) => (VPADDUSWMasked256 x y (VPMOVVec16x16ToM mask)) +(AddSaturatedMaskedUint16x32 x y mask) => (VPADDUSWMasked512 x y (VPMOVVec16x32ToM mask)) (AddSubFloat32x4 ...) => (VADDSUBPS128 ...) (AddSubFloat32x8 ...) => (VADDSUBPS256 ...) 
(AddSubFloat64x2 ...) => (VADDSUBPD128 ...) @@ -204,30 +216,6 @@ (AndNotMaskedUint64x2 x y mask) => (VPANDNQMasked128 x y (VPMOVVec64x2ToM mask)) (AndNotMaskedUint64x4 x y mask) => (VPANDNQMasked256 x y (VPMOVVec64x4ToM mask)) (AndNotMaskedUint64x8 x y mask) => (VPANDNQMasked512 x y (VPMOVVec64x8ToM mask)) -(ApproximateReciprocalFloat32x4 ...) => (VRCPPS128 ...) -(ApproximateReciprocalFloat32x8 ...) => (VRCPPS256 ...) -(ApproximateReciprocalFloat32x16 ...) => (VRCP14PS512 ...) -(ApproximateReciprocalFloat64x2 ...) => (VRCP14PD128 ...) -(ApproximateReciprocalFloat64x4 ...) => (VRCP14PD256 ...) -(ApproximateReciprocalFloat64x8 ...) => (VRCP14PD512 ...) -(ApproximateReciprocalMaskedFloat32x4 x mask) => (VRCP14PSMasked128 x (VPMOVVec32x4ToM mask)) -(ApproximateReciprocalMaskedFloat32x8 x mask) => (VRCP14PSMasked256 x (VPMOVVec32x8ToM mask)) -(ApproximateReciprocalMaskedFloat32x16 x mask) => (VRCP14PSMasked512 x (VPMOVVec32x16ToM mask)) -(ApproximateReciprocalMaskedFloat64x2 x mask) => (VRCP14PDMasked128 x (VPMOVVec64x2ToM mask)) -(ApproximateReciprocalMaskedFloat64x4 x mask) => (VRCP14PDMasked256 x (VPMOVVec64x4ToM mask)) -(ApproximateReciprocalMaskedFloat64x8 x mask) => (VRCP14PDMasked512 x (VPMOVVec64x8ToM mask)) -(ApproximateReciprocalOfSqrtFloat32x4 ...) => (VRSQRTPS128 ...) -(ApproximateReciprocalOfSqrtFloat32x8 ...) => (VRSQRTPS256 ...) -(ApproximateReciprocalOfSqrtFloat32x16 ...) => (VRSQRT14PS512 ...) -(ApproximateReciprocalOfSqrtFloat64x2 ...) => (VRSQRT14PD128 ...) -(ApproximateReciprocalOfSqrtFloat64x4 ...) => (VRSQRT14PD256 ...) -(ApproximateReciprocalOfSqrtFloat64x8 ...) => (VRSQRT14PD512 ...) -(ApproximateReciprocalOfSqrtMaskedFloat32x4 x mask) => (VRSQRT14PSMasked128 x (VPMOVVec32x4ToM mask)) -(ApproximateReciprocalOfSqrtMaskedFloat32x8 x mask) => (VRSQRT14PSMasked256 x (VPMOVVec32x8ToM mask)) -(ApproximateReciprocalOfSqrtMaskedFloat32x16 x mask) => (VRSQRT14PSMasked512 x (VPMOVVec32x16ToM mask)) -(ApproximateReciprocalOfSqrtMaskedFloat64x2 x mask) => (VRSQRT14PDMasked128 x (VPMOVVec64x2ToM mask)) -(ApproximateReciprocalOfSqrtMaskedFloat64x4 x mask) => (VRSQRT14PDMasked256 x (VPMOVVec64x4ToM mask)) -(ApproximateReciprocalOfSqrtMaskedFloat64x8 x mask) => (VRSQRT14PDMasked512 x (VPMOVVec64x8ToM mask)) (AverageUint8x16 ...) => (VPAVGB128 ...) (AverageUint8x32 ...) => (VPAVGB256 ...) (AverageUint8x64 ...) => (VPAVGB512 ...) @@ -310,6 +298,12 @@ (ConvertToUint32MaskedFloat32x4 x mask) => (VCVTPS2UDQMasked128 x (VPMOVVec32x4ToM mask)) (ConvertToUint32MaskedFloat32x8 x mask) => (VCVTPS2UDQMasked256 x (VPMOVVec32x8ToM mask)) (ConvertToUint32MaskedFloat32x16 x mask) => (VCVTPS2UDQMasked512 x (VPMOVVec32x16ToM mask)) +(CopySignInt8x16 ...) => (VPSIGNB128 ...) +(CopySignInt8x32 ...) => (VPSIGNB256 ...) +(CopySignInt16x8 ...) => (VPSIGNW128 ...) +(CopySignInt16x16 ...) => (VPSIGNW256 ...) +(CopySignInt32x4 ...) => (VPSIGND128 ...) +(CopySignInt32x8 ...) => (VPSIGND256 ...) (DivFloat32x4 ...) => (VDIVPS128 ...) (DivFloat32x8 ...) => (VDIVPS256 ...) (DivFloat32x16 ...) => (VDIVPS512 ...) @@ -322,9 +316,18 @@ (DivMaskedFloat64x2 x y mask) => (VDIVPDMasked128 x y (VPMOVVec64x2ToM mask)) (DivMaskedFloat64x4 x y mask) => (VDIVPDMasked256 x y (VPMOVVec64x4ToM mask)) (DivMaskedFloat64x8 x y mask) => (VDIVPDMasked512 x y (VPMOVVec64x8ToM mask)) -(DotProdBroadcastFloat32x4 x y) => (VDPPS128 [127] x y) -(DotProdBroadcastFloat32x8 x y) => (VDPPS256 [127] x y) -(DotProdBroadcastFloat64x2 x y) => (VDPPD128 [127] x y) +(DotProdPairsInt16x8 ...) => (VPMADDWD128 ...) +(DotProdPairsInt16x16 ...) 
=> (VPMADDWD256 ...) +(DotProdPairsInt16x32 ...) => (VPMADDWD512 ...) +(DotProdPairsMaskedInt16x8 x y mask) => (VPMADDWDMasked128 x y (VPMOVVec16x8ToM mask)) +(DotProdPairsMaskedInt16x16 x y mask) => (VPMADDWDMasked256 x y (VPMOVVec16x16ToM mask)) +(DotProdPairsMaskedInt16x32 x y mask) => (VPMADDWDMasked512 x y (VPMOVVec16x32ToM mask)) +(DotProdPairsSaturatedUint8x16 ...) => (VPMADDUBSW128 ...) +(DotProdPairsSaturatedUint8x32 ...) => (VPMADDUBSW256 ...) +(DotProdPairsSaturatedUint8x64 ...) => (VPMADDUBSW512 ...) +(DotProdPairsSaturatedMaskedUint8x16 x y mask) => (VPMADDUBSWMasked128 x y (VPMOVVec16x8ToM mask)) +(DotProdPairsSaturatedMaskedUint8x32 x y mask) => (VPMADDUBSWMasked256 x y (VPMOVVec16x16ToM mask)) +(DotProdPairsSaturatedMaskedUint8x64 x y mask) => (VPMADDUBSWMasked512 x y (VPMOVVec16x32ToM mask)) (EqualFloat32x4 x y) => (VCMPPS128 [0] x y) (EqualFloat32x8 x y) => (VCMPPS256 [0] x y) (EqualFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [0] x y)) @@ -443,42 +446,6 @@ (FloorScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask)) (FloorScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask)) (FloorScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask)) -(FusedMultiplyAddFloat32x4 ...) => (VFMADD213PS128 ...) -(FusedMultiplyAddFloat32x8 ...) => (VFMADD213PS256 ...) -(FusedMultiplyAddFloat32x16 ...) => (VFMADD213PS512 ...) -(FusedMultiplyAddFloat64x2 ...) => (VFMADD213PD128 ...) -(FusedMultiplyAddFloat64x4 ...) => (VFMADD213PD256 ...) -(FusedMultiplyAddFloat64x8 ...) => (VFMADD213PD512 ...) -(FusedMultiplyAddMaskedFloat32x4 x y z mask) => (VFMADD213PSMasked128 x y z (VPMOVVec32x4ToM mask)) -(FusedMultiplyAddMaskedFloat32x8 x y z mask) => (VFMADD213PSMasked256 x y z (VPMOVVec32x8ToM mask)) -(FusedMultiplyAddMaskedFloat32x16 x y z mask) => (VFMADD213PSMasked512 x y z (VPMOVVec32x16ToM mask)) -(FusedMultiplyAddMaskedFloat64x2 x y z mask) => (VFMADD213PDMasked128 x y z (VPMOVVec64x2ToM mask)) -(FusedMultiplyAddMaskedFloat64x4 x y z mask) => (VFMADD213PDMasked256 x y z (VPMOVVec64x4ToM mask)) -(FusedMultiplyAddMaskedFloat64x8 x y z mask) => (VFMADD213PDMasked512 x y z (VPMOVVec64x8ToM mask)) -(FusedMultiplyAddSubFloat32x4 ...) => (VFMADDSUB213PS128 ...) -(FusedMultiplyAddSubFloat32x8 ...) => (VFMADDSUB213PS256 ...) -(FusedMultiplyAddSubFloat32x16 ...) => (VFMADDSUB213PS512 ...) -(FusedMultiplyAddSubFloat64x2 ...) => (VFMADDSUB213PD128 ...) -(FusedMultiplyAddSubFloat64x4 ...) => (VFMADDSUB213PD256 ...) -(FusedMultiplyAddSubFloat64x8 ...) => (VFMADDSUB213PD512 ...) -(FusedMultiplyAddSubMaskedFloat32x4 x y z mask) => (VFMADDSUB213PSMasked128 x y z (VPMOVVec32x4ToM mask)) -(FusedMultiplyAddSubMaskedFloat32x8 x y z mask) => (VFMADDSUB213PSMasked256 x y z (VPMOVVec32x8ToM mask)) -(FusedMultiplyAddSubMaskedFloat32x16 x y z mask) => (VFMADDSUB213PSMasked512 x y z (VPMOVVec32x16ToM mask)) -(FusedMultiplyAddSubMaskedFloat64x2 x y z mask) => (VFMADDSUB213PDMasked128 x y z (VPMOVVec64x2ToM mask)) -(FusedMultiplyAddSubMaskedFloat64x4 x y z mask) => (VFMADDSUB213PDMasked256 x y z (VPMOVVec64x4ToM mask)) -(FusedMultiplyAddSubMaskedFloat64x8 x y z mask) => (VFMADDSUB213PDMasked512 x y z (VPMOVVec64x8ToM mask)) -(FusedMultiplySubAddFloat32x4 ...) => (VFMSUBADD213PS128 ...) -(FusedMultiplySubAddFloat32x8 ...) => (VFMSUBADD213PS256 ...) -(FusedMultiplySubAddFloat32x16 ...) => (VFMSUBADD213PS512 ...) -(FusedMultiplySubAddFloat64x2 ...) => (VFMSUBADD213PD128 ...) 
-(FusedMultiplySubAddFloat64x4 ...) => (VFMSUBADD213PD256 ...) -(FusedMultiplySubAddFloat64x8 ...) => (VFMSUBADD213PD512 ...) -(FusedMultiplySubAddMaskedFloat32x4 x y z mask) => (VFMSUBADD213PSMasked128 x y z (VPMOVVec32x4ToM mask)) -(FusedMultiplySubAddMaskedFloat32x8 x y z mask) => (VFMSUBADD213PSMasked256 x y z (VPMOVVec32x8ToM mask)) -(FusedMultiplySubAddMaskedFloat32x16 x y z mask) => (VFMSUBADD213PSMasked512 x y z (VPMOVVec32x16ToM mask)) -(FusedMultiplySubAddMaskedFloat64x2 x y z mask) => (VFMSUBADD213PDMasked128 x y z (VPMOVVec64x2ToM mask)) -(FusedMultiplySubAddMaskedFloat64x4 x y z mask) => (VFMSUBADD213PDMasked256 x y z (VPMOVVec64x4ToM mask)) -(FusedMultiplySubAddMaskedFloat64x8 x y z mask) => (VFMSUBADD213PDMasked512 x y z (VPMOVVec64x8ToM mask)) (GaloisFieldAffineTransformUint8x16 ...) => (VGF2P8AFFINEQB128 ...) (GaloisFieldAffineTransformUint8x32 ...) => (VGF2P8AFFINEQB256 ...) (GaloisFieldAffineTransformUint8x64 ...) => (VGF2P8AFFINEQB512 ...) @@ -932,34 +899,49 @@ (MulInt64x2 ...) => (VPMULLQ128 ...) (MulInt64x4 ...) => (VPMULLQ256 ...) (MulInt64x8 ...) => (VPMULLQ512 ...) +(MulUint16x8 ...) => (VPMULLW128 ...) +(MulUint16x16 ...) => (VPMULLW256 ...) +(MulUint16x32 ...) => (VPMULLW512 ...) +(MulUint32x4 ...) => (VPMULLD128 ...) +(MulUint32x8 ...) => (VPMULLD256 ...) +(MulUint32x16 ...) => (VPMULLD512 ...) +(MulUint64x2 ...) => (VPMULLQ128 ...) +(MulUint64x4 ...) => (VPMULLQ256 ...) +(MulUint64x8 ...) => (VPMULLQ512 ...) +(MulAddFloat32x4 ...) => (VFMADD213PS128 ...) +(MulAddFloat32x8 ...) => (VFMADD213PS256 ...) +(MulAddFloat32x16 ...) => (VFMADD213PS512 ...) +(MulAddFloat64x2 ...) => (VFMADD213PD128 ...) +(MulAddFloat64x4 ...) => (VFMADD213PD256 ...) +(MulAddFloat64x8 ...) => (VFMADD213PD512 ...) +(MulAddMaskedFloat32x4 x y z mask) => (VFMADD213PSMasked128 x y z (VPMOVVec32x4ToM mask)) +(MulAddMaskedFloat32x8 x y z mask) => (VFMADD213PSMasked256 x y z (VPMOVVec32x8ToM mask)) +(MulAddMaskedFloat32x16 x y z mask) => (VFMADD213PSMasked512 x y z (VPMOVVec32x16ToM mask)) +(MulAddMaskedFloat64x2 x y z mask) => (VFMADD213PDMasked128 x y z (VPMOVVec64x2ToM mask)) +(MulAddMaskedFloat64x4 x y z mask) => (VFMADD213PDMasked256 x y z (VPMOVVec64x4ToM mask)) +(MulAddMaskedFloat64x8 x y z mask) => (VFMADD213PDMasked512 x y z (VPMOVVec64x8ToM mask)) +(MulAddSubFloat32x4 ...) => (VFMADDSUB213PS128 ...) +(MulAddSubFloat32x8 ...) => (VFMADDSUB213PS256 ...) +(MulAddSubFloat32x16 ...) => (VFMADDSUB213PS512 ...) +(MulAddSubFloat64x2 ...) => (VFMADDSUB213PD128 ...) +(MulAddSubFloat64x4 ...) => (VFMADDSUB213PD256 ...) +(MulAddSubFloat64x8 ...) => (VFMADDSUB213PD512 ...) +(MulAddSubMaskedFloat32x4 x y z mask) => (VFMADDSUB213PSMasked128 x y z (VPMOVVec32x4ToM mask)) +(MulAddSubMaskedFloat32x8 x y z mask) => (VFMADDSUB213PSMasked256 x y z (VPMOVVec32x8ToM mask)) +(MulAddSubMaskedFloat32x16 x y z mask) => (VFMADDSUB213PSMasked512 x y z (VPMOVVec32x16ToM mask)) +(MulAddSubMaskedFloat64x2 x y z mask) => (VFMADDSUB213PDMasked128 x y z (VPMOVVec64x2ToM mask)) +(MulAddSubMaskedFloat64x4 x y z mask) => (VFMADDSUB213PDMasked256 x y z (VPMOVVec64x4ToM mask)) +(MulAddSubMaskedFloat64x8 x y z mask) => (VFMADDSUB213PDMasked512 x y z (VPMOVVec64x8ToM mask)) (MulEvenWidenInt32x4 ...) => (VPMULDQ128 ...) (MulEvenWidenInt32x8 ...) => (VPMULDQ256 ...) -(MulEvenWidenInt64x2 ...) => (VPMULDQ128 ...) -(MulEvenWidenInt64x4 ...) => (VPMULDQ256 ...) -(MulEvenWidenInt64x8 ...) => (VPMULDQ512 ...) (MulEvenWidenUint32x4 ...) => (VPMULUDQ128 ...) (MulEvenWidenUint32x8 ...) => (VPMULUDQ256 ...) -(MulEvenWidenUint64x2 ...) 
=> (VPMULUDQ128 ...) -(MulEvenWidenUint64x4 ...) => (VPMULUDQ256 ...) -(MulEvenWidenUint64x8 ...) => (VPMULUDQ512 ...) -(MulEvenWidenMaskedInt64x2 x y mask) => (VPMULDQMasked128 x y (VPMOVVec64x2ToM mask)) -(MulEvenWidenMaskedInt64x4 x y mask) => (VPMULDQMasked256 x y (VPMOVVec64x4ToM mask)) -(MulEvenWidenMaskedInt64x8 x y mask) => (VPMULDQMasked512 x y (VPMOVVec64x8ToM mask)) -(MulEvenWidenMaskedUint64x2 x y mask) => (VPMULUDQMasked128 x y (VPMOVVec64x2ToM mask)) -(MulEvenWidenMaskedUint64x4 x y mask) => (VPMULUDQMasked256 x y (VPMOVVec64x4ToM mask)) -(MulEvenWidenMaskedUint64x8 x y mask) => (VPMULUDQMasked512 x y (VPMOVVec64x8ToM mask)) -(MulHighInt16x8 ...) => (VPMULHW128 ...) -(MulHighInt16x16 ...) => (VPMULHW256 ...) +(MulHighInt16x8 ...) => (VPMULHUW128 ...) +(MulHighInt16x16 ...) => (VPMULHUW256 ...) (MulHighInt16x32 ...) => (VPMULHW512 ...) -(MulHighUint16x8 ...) => (VPMULHUW128 ...) -(MulHighUint16x16 ...) => (VPMULHUW256 ...) -(MulHighUint16x32 ...) => (VPMULHUW512 ...) -(MulHighMaskedInt16x8 x y mask) => (VPMULHWMasked128 x y (VPMOVVec16x8ToM mask)) +(MulHighMaskedInt16x8 x y mask) => (VPMULHUWMasked128 x y (VPMOVVec16x8ToM mask)) (MulHighMaskedInt16x16 x y mask) => (VPMULHWMasked256 x y (VPMOVVec16x16ToM mask)) -(MulHighMaskedInt16x32 x y mask) => (VPMULHWMasked512 x y (VPMOVVec16x32ToM mask)) -(MulHighMaskedUint16x8 x y mask) => (VPMULHUWMasked128 x y (VPMOVVec16x8ToM mask)) -(MulHighMaskedUint16x16 x y mask) => (VPMULHUWMasked256 x y (VPMOVVec16x16ToM mask)) -(MulHighMaskedUint16x32 x y mask) => (VPMULHUWMasked512 x y (VPMOVVec16x32ToM mask)) +(MulHighMaskedInt16x32 x y mask) => (VPMULHUWMasked512 x y (VPMOVVec16x32ToM mask)) (MulMaskedFloat32x4 x y mask) => (VMULPSMasked128 x y (VPMOVVec32x4ToM mask)) (MulMaskedFloat32x8 x y mask) => (VMULPSMasked256 x y (VPMOVVec32x8ToM mask)) (MulMaskedFloat32x16 x y mask) => (VMULPSMasked512 x y (VPMOVVec32x16ToM mask)) @@ -975,6 +957,27 @@ (MulMaskedInt64x2 x y mask) => (VPMULLQMasked128 x y (VPMOVVec64x2ToM mask)) (MulMaskedInt64x4 x y mask) => (VPMULLQMasked256 x y (VPMOVVec64x4ToM mask)) (MulMaskedInt64x8 x y mask) => (VPMULLQMasked512 x y (VPMOVVec64x8ToM mask)) +(MulMaskedUint16x8 x y mask) => (VPMULLWMasked128 x y (VPMOVVec16x8ToM mask)) +(MulMaskedUint16x16 x y mask) => (VPMULLWMasked256 x y (VPMOVVec16x16ToM mask)) +(MulMaskedUint16x32 x y mask) => (VPMULLWMasked512 x y (VPMOVVec16x32ToM mask)) +(MulMaskedUint32x4 x y mask) => (VPMULLDMasked128 x y (VPMOVVec32x4ToM mask)) +(MulMaskedUint32x8 x y mask) => (VPMULLDMasked256 x y (VPMOVVec32x8ToM mask)) +(MulMaskedUint32x16 x y mask) => (VPMULLDMasked512 x y (VPMOVVec32x16ToM mask)) +(MulMaskedUint64x2 x y mask) => (VPMULLQMasked128 x y (VPMOVVec64x2ToM mask)) +(MulMaskedUint64x4 x y mask) => (VPMULLQMasked256 x y (VPMOVVec64x4ToM mask)) +(MulMaskedUint64x8 x y mask) => (VPMULLQMasked512 x y (VPMOVVec64x8ToM mask)) +(MulSubAddFloat32x4 ...) => (VFMSUBADD213PS128 ...) +(MulSubAddFloat32x8 ...) => (VFMSUBADD213PS256 ...) +(MulSubAddFloat32x16 ...) => (VFMSUBADD213PS512 ...) +(MulSubAddFloat64x2 ...) => (VFMSUBADD213PD128 ...) +(MulSubAddFloat64x4 ...) => (VFMSUBADD213PD256 ...) +(MulSubAddFloat64x8 ...) => (VFMSUBADD213PD512 ...) 
+(MulSubAddMaskedFloat32x4 x y z mask) => (VFMSUBADD213PSMasked128 x y z (VPMOVVec32x4ToM mask)) +(MulSubAddMaskedFloat32x8 x y z mask) => (VFMSUBADD213PSMasked256 x y z (VPMOVVec32x8ToM mask)) +(MulSubAddMaskedFloat32x16 x y z mask) => (VFMSUBADD213PSMasked512 x y z (VPMOVVec32x16ToM mask)) +(MulSubAddMaskedFloat64x2 x y z mask) => (VFMSUBADD213PDMasked128 x y z (VPMOVVec64x2ToM mask)) +(MulSubAddMaskedFloat64x4 x y z mask) => (VFMSUBADD213PDMasked256 x y z (VPMOVVec64x4ToM mask)) +(MulSubAddMaskedFloat64x8 x y z mask) => (VFMSUBADD213PDMasked512 x y z (VPMOVVec64x8ToM mask)) (NotEqualFloat32x4 x y) => (VCMPPS128 [4] x y) (NotEqualFloat32x8 x y) => (VCMPPS256 [4] x y) (NotEqualFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [4] x y)) @@ -1035,6 +1038,54 @@ (NotEqualMaskedUint64x2 x y mask) => (VPMOVMToVec64x2 (VPCMPUQMasked128 [4] x y (VPMOVVec64x2ToM mask))) (NotEqualMaskedUint64x4 x y mask) => (VPMOVMToVec64x4 (VPCMPUQMasked256 [4] x y (VPMOVVec64x4ToM mask))) (NotEqualMaskedUint64x8 x y mask) => (VPMOVMToVec64x8 (VPCMPUQMasked512 [4] x y (VPMOVVec64x8ToM mask))) +(OnesCountInt8x16 ...) => (VPOPCNTB128 ...) +(OnesCountInt8x32 ...) => (VPOPCNTB256 ...) +(OnesCountInt8x64 ...) => (VPOPCNTB512 ...) +(OnesCountInt16x8 ...) => (VPOPCNTW128 ...) +(OnesCountInt16x16 ...) => (VPOPCNTW256 ...) +(OnesCountInt16x32 ...) => (VPOPCNTW512 ...) +(OnesCountInt32x4 ...) => (VPOPCNTD128 ...) +(OnesCountInt32x8 ...) => (VPOPCNTD256 ...) +(OnesCountInt32x16 ...) => (VPOPCNTD512 ...) +(OnesCountInt64x2 ...) => (VPOPCNTQ128 ...) +(OnesCountInt64x4 ...) => (VPOPCNTQ256 ...) +(OnesCountInt64x8 ...) => (VPOPCNTQ512 ...) +(OnesCountUint8x16 ...) => (VPOPCNTB128 ...) +(OnesCountUint8x32 ...) => (VPOPCNTB256 ...) +(OnesCountUint8x64 ...) => (VPOPCNTB512 ...) +(OnesCountUint16x8 ...) => (VPOPCNTW128 ...) +(OnesCountUint16x16 ...) => (VPOPCNTW256 ...) +(OnesCountUint16x32 ...) => (VPOPCNTW512 ...) +(OnesCountUint32x4 ...) => (VPOPCNTD128 ...) +(OnesCountUint32x8 ...) => (VPOPCNTD256 ...) +(OnesCountUint32x16 ...) => (VPOPCNTD512 ...) +(OnesCountUint64x2 ...) => (VPOPCNTQ128 ...) +(OnesCountUint64x4 ...) => (VPOPCNTQ256 ...) +(OnesCountUint64x8 ...) => (VPOPCNTQ512 ...) 
+(OnesCountMaskedInt8x16 x mask) => (VPOPCNTBMasked128 x (VPMOVVec8x16ToM mask)) +(OnesCountMaskedInt8x32 x mask) => (VPOPCNTBMasked256 x (VPMOVVec8x32ToM mask)) +(OnesCountMaskedInt8x64 x mask) => (VPOPCNTBMasked512 x (VPMOVVec8x64ToM mask)) +(OnesCountMaskedInt16x8 x mask) => (VPOPCNTWMasked128 x (VPMOVVec16x8ToM mask)) +(OnesCountMaskedInt16x16 x mask) => (VPOPCNTWMasked256 x (VPMOVVec16x16ToM mask)) +(OnesCountMaskedInt16x32 x mask) => (VPOPCNTWMasked512 x (VPMOVVec16x32ToM mask)) +(OnesCountMaskedInt32x4 x mask) => (VPOPCNTDMasked128 x (VPMOVVec32x4ToM mask)) +(OnesCountMaskedInt32x8 x mask) => (VPOPCNTDMasked256 x (VPMOVVec32x8ToM mask)) +(OnesCountMaskedInt32x16 x mask) => (VPOPCNTDMasked512 x (VPMOVVec32x16ToM mask)) +(OnesCountMaskedInt64x2 x mask) => (VPOPCNTQMasked128 x (VPMOVVec64x2ToM mask)) +(OnesCountMaskedInt64x4 x mask) => (VPOPCNTQMasked256 x (VPMOVVec64x4ToM mask)) +(OnesCountMaskedInt64x8 x mask) => (VPOPCNTQMasked512 x (VPMOVVec64x8ToM mask)) +(OnesCountMaskedUint8x16 x mask) => (VPOPCNTBMasked128 x (VPMOVVec8x16ToM mask)) +(OnesCountMaskedUint8x32 x mask) => (VPOPCNTBMasked256 x (VPMOVVec8x32ToM mask)) +(OnesCountMaskedUint8x64 x mask) => (VPOPCNTBMasked512 x (VPMOVVec8x64ToM mask)) +(OnesCountMaskedUint16x8 x mask) => (VPOPCNTWMasked128 x (VPMOVVec16x8ToM mask)) +(OnesCountMaskedUint16x16 x mask) => (VPOPCNTWMasked256 x (VPMOVVec16x16ToM mask)) +(OnesCountMaskedUint16x32 x mask) => (VPOPCNTWMasked512 x (VPMOVVec16x32ToM mask)) +(OnesCountMaskedUint32x4 x mask) => (VPOPCNTDMasked128 x (VPMOVVec32x4ToM mask)) +(OnesCountMaskedUint32x8 x mask) => (VPOPCNTDMasked256 x (VPMOVVec32x8ToM mask)) +(OnesCountMaskedUint32x16 x mask) => (VPOPCNTDMasked512 x (VPMOVVec32x16ToM mask)) +(OnesCountMaskedUint64x2 x mask) => (VPOPCNTQMasked128 x (VPMOVVec64x2ToM mask)) +(OnesCountMaskedUint64x4 x mask) => (VPOPCNTQMasked256 x (VPMOVVec64x4ToM mask)) +(OnesCountMaskedUint64x8 x mask) => (VPOPCNTQMasked512 x (VPMOVVec64x8ToM mask)) (OrInt8x16 ...) => (VPOR128 ...) (OrInt8x32 ...) => (VPOR256 ...) (OrInt8x64 ...) => (VPORD512 ...) @@ -1071,12 +1122,6 @@ (OrMaskedUint64x2 x y mask) => (VPORQMasked128 x y (VPMOVVec64x2ToM mask)) (OrMaskedUint64x4 x y mask) => (VPORQMasked256 x y (VPMOVVec64x4ToM mask)) (OrMaskedUint64x8 x y mask) => (VPORQMasked512 x y (VPMOVVec64x8ToM mask)) -(PairDotProdInt16x8 ...) => (VPMADDWD128 ...) -(PairDotProdInt16x16 ...) => (VPMADDWD256 ...) -(PairDotProdInt16x32 ...) => (VPMADDWD512 ...) -(PairDotProdMaskedInt16x8 x y mask) => (VPMADDWDMasked128 x y (VPMOVVec16x8ToM mask)) -(PairDotProdMaskedInt16x16 x y mask) => (VPMADDWDMasked256 x y (VPMOVVec16x16ToM mask)) -(PairDotProdMaskedInt16x32 x y mask) => (VPMADDWDMasked512 x y (VPMOVVec16x32ToM mask)) (PermuteFloat32x8 ...) => (VPERMPS256 ...) (PermuteFloat32x16 ...) => (VPERMPS512 ...) (PermuteFloat64x4 ...) => (VPERMPD256 ...) @@ -1185,54 +1230,30 @@ (PermuteMaskedUint32x16 x y mask) => (VPERMDMasked512 x y (VPMOVVec32x16ToM mask)) (PermuteMaskedUint64x4 x y mask) => (VPERMQMasked256 x y (VPMOVVec64x4ToM mask)) (PermuteMaskedUint64x8 x y mask) => (VPERMQMasked512 x y (VPMOVVec64x8ToM mask)) -(PopCountInt8x16 ...) => (VPOPCNTB128 ...) -(PopCountInt8x32 ...) => (VPOPCNTB256 ...) -(PopCountInt8x64 ...) => (VPOPCNTB512 ...) -(PopCountInt16x8 ...) => (VPOPCNTW128 ...) -(PopCountInt16x16 ...) => (VPOPCNTW256 ...) -(PopCountInt16x32 ...) => (VPOPCNTW512 ...) -(PopCountInt32x4 ...) => (VPOPCNTD128 ...) -(PopCountInt32x8 ...) => (VPOPCNTD256 ...) -(PopCountInt32x16 ...) => (VPOPCNTD512 ...) -(PopCountInt64x2 ...) 
=> (VPOPCNTQ128 ...) -(PopCountInt64x4 ...) => (VPOPCNTQ256 ...) -(PopCountInt64x8 ...) => (VPOPCNTQ512 ...) -(PopCountUint8x16 ...) => (VPOPCNTB128 ...) -(PopCountUint8x32 ...) => (VPOPCNTB256 ...) -(PopCountUint8x64 ...) => (VPOPCNTB512 ...) -(PopCountUint16x8 ...) => (VPOPCNTW128 ...) -(PopCountUint16x16 ...) => (VPOPCNTW256 ...) -(PopCountUint16x32 ...) => (VPOPCNTW512 ...) -(PopCountUint32x4 ...) => (VPOPCNTD128 ...) -(PopCountUint32x8 ...) => (VPOPCNTD256 ...) -(PopCountUint32x16 ...) => (VPOPCNTD512 ...) -(PopCountUint64x2 ...) => (VPOPCNTQ128 ...) -(PopCountUint64x4 ...) => (VPOPCNTQ256 ...) -(PopCountUint64x8 ...) => (VPOPCNTQ512 ...) -(PopCountMaskedInt8x16 x mask) => (VPOPCNTBMasked128 x (VPMOVVec8x16ToM mask)) -(PopCountMaskedInt8x32 x mask) => (VPOPCNTBMasked256 x (VPMOVVec8x32ToM mask)) -(PopCountMaskedInt8x64 x mask) => (VPOPCNTBMasked512 x (VPMOVVec8x64ToM mask)) -(PopCountMaskedInt16x8 x mask) => (VPOPCNTWMasked128 x (VPMOVVec16x8ToM mask)) -(PopCountMaskedInt16x16 x mask) => (VPOPCNTWMasked256 x (VPMOVVec16x16ToM mask)) -(PopCountMaskedInt16x32 x mask) => (VPOPCNTWMasked512 x (VPMOVVec16x32ToM mask)) -(PopCountMaskedInt32x4 x mask) => (VPOPCNTDMasked128 x (VPMOVVec32x4ToM mask)) -(PopCountMaskedInt32x8 x mask) => (VPOPCNTDMasked256 x (VPMOVVec32x8ToM mask)) -(PopCountMaskedInt32x16 x mask) => (VPOPCNTDMasked512 x (VPMOVVec32x16ToM mask)) -(PopCountMaskedInt64x2 x mask) => (VPOPCNTQMasked128 x (VPMOVVec64x2ToM mask)) -(PopCountMaskedInt64x4 x mask) => (VPOPCNTQMasked256 x (VPMOVVec64x4ToM mask)) -(PopCountMaskedInt64x8 x mask) => (VPOPCNTQMasked512 x (VPMOVVec64x8ToM mask)) -(PopCountMaskedUint8x16 x mask) => (VPOPCNTBMasked128 x (VPMOVVec8x16ToM mask)) -(PopCountMaskedUint8x32 x mask) => (VPOPCNTBMasked256 x (VPMOVVec8x32ToM mask)) -(PopCountMaskedUint8x64 x mask) => (VPOPCNTBMasked512 x (VPMOVVec8x64ToM mask)) -(PopCountMaskedUint16x8 x mask) => (VPOPCNTWMasked128 x (VPMOVVec16x8ToM mask)) -(PopCountMaskedUint16x16 x mask) => (VPOPCNTWMasked256 x (VPMOVVec16x16ToM mask)) -(PopCountMaskedUint16x32 x mask) => (VPOPCNTWMasked512 x (VPMOVVec16x32ToM mask)) -(PopCountMaskedUint32x4 x mask) => (VPOPCNTDMasked128 x (VPMOVVec32x4ToM mask)) -(PopCountMaskedUint32x8 x mask) => (VPOPCNTDMasked256 x (VPMOVVec32x8ToM mask)) -(PopCountMaskedUint32x16 x mask) => (VPOPCNTDMasked512 x (VPMOVVec32x16ToM mask)) -(PopCountMaskedUint64x2 x mask) => (VPOPCNTQMasked128 x (VPMOVVec64x2ToM mask)) -(PopCountMaskedUint64x4 x mask) => (VPOPCNTQMasked256 x (VPMOVVec64x4ToM mask)) -(PopCountMaskedUint64x8 x mask) => (VPOPCNTQMasked512 x (VPMOVVec64x8ToM mask)) +(ReciprocalFloat32x4 ...) => (VRCPPS128 ...) +(ReciprocalFloat32x8 ...) => (VRCPPS256 ...) +(ReciprocalFloat32x16 ...) => (VRCP14PS512 ...) +(ReciprocalFloat64x2 ...) => (VRCP14PD128 ...) +(ReciprocalFloat64x4 ...) => (VRCP14PD256 ...) +(ReciprocalFloat64x8 ...) => (VRCP14PD512 ...) +(ReciprocalMaskedFloat32x4 x mask) => (VRCP14PSMasked128 x (VPMOVVec32x4ToM mask)) +(ReciprocalMaskedFloat32x8 x mask) => (VRCP14PSMasked256 x (VPMOVVec32x8ToM mask)) +(ReciprocalMaskedFloat32x16 x mask) => (VRCP14PSMasked512 x (VPMOVVec32x16ToM mask)) +(ReciprocalMaskedFloat64x2 x mask) => (VRCP14PDMasked128 x (VPMOVVec64x2ToM mask)) +(ReciprocalMaskedFloat64x4 x mask) => (VRCP14PDMasked256 x (VPMOVVec64x4ToM mask)) +(ReciprocalMaskedFloat64x8 x mask) => (VRCP14PDMasked512 x (VPMOVVec64x8ToM mask)) +(ReciprocalSqrtFloat32x4 ...) => (VRSQRTPS128 ...) +(ReciprocalSqrtFloat32x8 ...) => (VRSQRTPS256 ...) +(ReciprocalSqrtFloat32x16 ...) => (VRSQRT14PS512 ...) 
+(ReciprocalSqrtFloat64x2 ...) => (VRSQRT14PD128 ...) +(ReciprocalSqrtFloat64x4 ...) => (VRSQRT14PD256 ...) +(ReciprocalSqrtFloat64x8 ...) => (VRSQRT14PD512 ...) +(ReciprocalSqrtMaskedFloat32x4 x mask) => (VRSQRT14PSMasked128 x (VPMOVVec32x4ToM mask)) +(ReciprocalSqrtMaskedFloat32x8 x mask) => (VRSQRT14PSMasked256 x (VPMOVVec32x8ToM mask)) +(ReciprocalSqrtMaskedFloat32x16 x mask) => (VRSQRT14PSMasked512 x (VPMOVVec32x16ToM mask)) +(ReciprocalSqrtMaskedFloat64x2 x mask) => (VRSQRT14PDMasked128 x (VPMOVVec64x2ToM mask)) +(ReciprocalSqrtMaskedFloat64x4 x mask) => (VRSQRT14PDMasked256 x (VPMOVVec64x4ToM mask)) +(ReciprocalSqrtMaskedFloat64x8 x mask) => (VRSQRT14PDMasked512 x (VPMOVVec64x8ToM mask)) (RotateAllLeftInt32x4 ...) => (VPROLD128 ...) (RotateAllLeftInt32x8 ...) => (VPROLD256 ...) (RotateAllLeftInt32x16 ...) => (VPROLD512 ...) @@ -1329,52 +1350,34 @@ (RotateRightMaskedUint64x2 x y mask) => (VPRORVQMasked128 x y (VPMOVVec64x2ToM mask)) (RotateRightMaskedUint64x4 x y mask) => (VPRORVQMasked256 x y (VPMOVVec64x4ToM mask)) (RotateRightMaskedUint64x8 x y mask) => (VPRORVQMasked512 x y (VPMOVVec64x8ToM mask)) -(RoundFloat32x4 x) => (VROUNDPS128 [0] x) -(RoundFloat32x8 x) => (VROUNDPS256 [0] x) -(RoundFloat64x2 x) => (VROUNDPD128 [0] x) -(RoundFloat64x4 x) => (VROUNDPD256 [0] x) -(RoundScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+0] x) -(RoundScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+0] x) -(RoundScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+0] x) -(RoundScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+0] x) -(RoundScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+0] x) -(RoundScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+0] x) -(RoundScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask)) -(RoundScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask)) -(RoundScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask)) -(RoundScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask)) -(RoundScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask)) -(RoundScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask)) -(RoundScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+0] x) -(RoundScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+0] x) -(RoundScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+0] x) -(RoundScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+0] x) -(RoundScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+0] x) -(RoundScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+0] x) -(RoundScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask)) -(RoundScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask)) -(RoundScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask)) -(RoundScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask)) -(RoundScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask)) -(RoundScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask)) -(SaturatedAddDotProdInt32x4 ...) => (VPDPWSSDS128 ...) -(SaturatedAddDotProdInt32x8 ...) => (VPDPWSSDS256 ...) -(SaturatedAddDotProdInt32x16 ...) => (VPDPWSSDS512 ...) 
-(SaturatedAddDotProdMaskedInt32x4 x y z mask) => (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM mask)) -(SaturatedAddDotProdMaskedInt32x8 x y z mask) => (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM mask)) -(SaturatedAddDotProdMaskedInt32x16 x y z mask) => (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM mask)) -(SaturatedUnsignedSignedPairDotProdUint8x16 ...) => (VPMADDUBSW128 ...) -(SaturatedUnsignedSignedPairDotProdUint8x32 ...) => (VPMADDUBSW256 ...) -(SaturatedUnsignedSignedPairDotProdUint8x64 ...) => (VPMADDUBSW512 ...) -(SaturatedUnsignedSignedPairDotProdMaskedUint8x16 x y mask) => (VPMADDUBSWMasked128 x y (VPMOVVec16x8ToM mask)) -(SaturatedUnsignedSignedPairDotProdMaskedUint8x32 x y mask) => (VPMADDUBSWMasked256 x y (VPMOVVec16x16ToM mask)) -(SaturatedUnsignedSignedPairDotProdMaskedUint8x64 x y mask) => (VPMADDUBSWMasked512 x y (VPMOVVec16x32ToM mask)) -(SaturatedUnsignedSignedQuadDotProdAccumulateInt32x4 ...) => (VPDPBUSDS128 ...) -(SaturatedUnsignedSignedQuadDotProdAccumulateInt32x8 ...) => (VPDPBUSDS256 ...) -(SaturatedUnsignedSignedQuadDotProdAccumulateInt32x16 ...) => (VPDPBUSDS512 ...) -(SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4 x y z mask) => (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM mask)) -(SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8 x y z mask) => (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM mask)) -(SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16 x y z mask) => (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM mask)) +(RoundToEvenFloat32x4 x) => (VROUNDPS128 [0] x) +(RoundToEvenFloat32x8 x) => (VROUNDPS256 [0] x) +(RoundToEvenFloat64x2 x) => (VROUNDPD128 [0] x) +(RoundToEvenFloat64x4 x) => (VROUNDPD256 [0] x) +(RoundToEvenScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+0] x) +(RoundToEvenScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+0] x) +(RoundToEvenScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+0] x) +(RoundToEvenScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+0] x) +(RoundToEvenScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+0] x) +(RoundToEvenScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+0] x) +(RoundToEvenScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask)) +(RoundToEvenScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask)) +(RoundToEvenScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask)) +(RoundToEvenScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask)) +(RoundToEvenScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask)) +(RoundToEvenScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask)) +(RoundToEvenScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+0] x) +(RoundToEvenScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+0] x) +(RoundToEvenScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+0] x) +(RoundToEvenScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+0] x) +(RoundToEvenScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+0] x) +(RoundToEvenScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+0] x) +(RoundToEvenScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask)) +(RoundToEvenScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask)) +(RoundToEvenScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask)) +(RoundToEvenScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask)) 
+(RoundToEvenScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask)) +(RoundToEvenScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask)) (ScaleFloat32x4 ...) => (VSCALEFPS128 ...) (ScaleFloat32x8 ...) => (VSCALEFPS256 ...) (ScaleFloat32x16 ...) => (VSCALEFPS512 ...) @@ -1795,12 +1798,6 @@ (ShiftRightMaskedUint64x2 x y mask) => (VPSRLVQMasked128 x y (VPMOVVec64x2ToM mask)) (ShiftRightMaskedUint64x4 x y mask) => (VPSRLVQMasked256 x y (VPMOVVec64x4ToM mask)) (ShiftRightMaskedUint64x8 x y mask) => (VPSRLVQMasked512 x y (VPMOVVec64x8ToM mask)) -(SignInt8x16 ...) => (VPSIGNB128 ...) -(SignInt8x32 ...) => (VPSIGNB256 ...) -(SignInt16x8 ...) => (VPSIGNW128 ...) -(SignInt16x16 ...) => (VPSIGNW256 ...) -(SignInt32x4 ...) => (VPSIGND128 ...) -(SignInt32x8 ...) => (VPSIGND256 ...) (SqrtFloat32x4 ...) => (VSQRTPS128 ...) (SqrtFloat32x8 ...) => (VSQRTPS256 ...) (SqrtFloat32x16 ...) => (VSQRTPS512 ...) @@ -1893,24 +1890,24 @@ (SubSaturatedInt16x8 ...) => (VPSUBSW128 ...) (SubSaturatedInt16x16 ...) => (VPSUBSW256 ...) (SubSaturatedInt16x32 ...) => (VPSUBSW512 ...) -(SubSaturatedUint8x16 ...) => (VPSUBSB128 ...) -(SubSaturatedUint8x32 ...) => (VPSUBSB256 ...) -(SubSaturatedUint8x64 ...) => (VPSUBSB512 ...) -(SubSaturatedUint16x8 ...) => (VPSUBSW128 ...) -(SubSaturatedUint16x16 ...) => (VPSUBSW256 ...) -(SubSaturatedUint16x32 ...) => (VPSUBSW512 ...) +(SubSaturatedUint8x16 ...) => (VPSUBUSB128 ...) +(SubSaturatedUint8x32 ...) => (VPSUBUSB256 ...) +(SubSaturatedUint8x64 ...) => (VPSUBUSB512 ...) +(SubSaturatedUint16x8 ...) => (VPSUBUSW128 ...) +(SubSaturatedUint16x16 ...) => (VPSUBUSW256 ...) +(SubSaturatedUint16x32 ...) => (VPSUBUSW512 ...) (SubSaturatedMaskedInt8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask)) (SubSaturatedMaskedInt8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask)) (SubSaturatedMaskedInt8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask)) (SubSaturatedMaskedInt16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask)) (SubSaturatedMaskedInt16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask)) (SubSaturatedMaskedInt16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask)) -(SubSaturatedMaskedUint8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask)) -(SubSaturatedMaskedUint8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask)) -(SubSaturatedMaskedUint8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask)) -(SubSaturatedMaskedUint16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask)) -(SubSaturatedMaskedUint16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask)) -(SubSaturatedMaskedUint16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask)) +(SubSaturatedMaskedUint8x16 x y mask) => (VPSUBUSBMasked128 x y (VPMOVVec8x16ToM mask)) +(SubSaturatedMaskedUint8x32 x y mask) => (VPSUBUSBMasked256 x y (VPMOVVec8x32ToM mask)) +(SubSaturatedMaskedUint8x64 x y mask) => (VPSUBUSBMasked512 x y (VPMOVVec8x64ToM mask)) +(SubSaturatedMaskedUint16x8 x y mask) => (VPSUBUSWMasked128 x y (VPMOVVec16x8ToM mask)) +(SubSaturatedMaskedUint16x16 x y mask) => (VPSUBUSWMasked256 x y (VPMOVVec16x16ToM mask)) +(SubSaturatedMaskedUint16x32 x y mask) => (VPSUBUSWMasked512 x y (VPMOVVec16x32ToM mask)) (TruncFloat32x4 x) => (VROUNDPS128 [3] x) (TruncFloat32x8 x) => (VROUNDPS256 [3] x) (TruncFloat64x2 x) => (VROUNDPD128 [3] x) @@ -1939,12 +1936,6 @@ (TruncScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+3] x 
(VPMOVVec64x2ToM mask)) (TruncScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask)) (TruncScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask)) -(UnsignedSignedQuadDotProdAccumulateInt32x4 ...) => (VPDPBUSD128 ...) -(UnsignedSignedQuadDotProdAccumulateInt32x8 ...) => (VPDPBUSD256 ...) -(UnsignedSignedQuadDotProdAccumulateInt32x16 ...) => (VPDPBUSD512 ...) -(UnsignedSignedQuadDotProdAccumulateMaskedInt32x4 x y z mask) => (VPDPBUSDMasked128 x y z (VPMOVVec32x4ToM mask)) -(UnsignedSignedQuadDotProdAccumulateMaskedInt32x8 x y z mask) => (VPDPBUSDMasked256 x y z (VPMOVVec32x8ToM mask)) -(UnsignedSignedQuadDotProdAccumulateMaskedInt32x16 x y z mask) => (VPDPBUSDMasked512 x y z (VPMOVVec32x16ToM mask)) (XorInt8x16 ...) => (VPXOR128 ...) (XorInt8x32 ...) => (VPXOR256 ...) (XorInt8x64 ...) => (VPXORD512 ...) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index ccda39f59d..665372f79d 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -195,6 +195,18 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPADDSWMasked128", argLength: 3, reg: w2kw, asm: "VPADDSW", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPADDSWMasked256", argLength: 3, reg: w2kw, asm: "VPADDSW", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPADDSWMasked512", argLength: 3, reg: w2kw, asm: "VPADDSW", commutative: true, typ: "Vec512", resultInArg0: false}, + {name: "VPADDUSB128", argLength: 2, reg: v21, asm: "VPADDUSB", commutative: true, typ: "Vec128", resultInArg0: false}, + {name: "VPADDUSB256", argLength: 2, reg: v21, asm: "VPADDUSB", commutative: true, typ: "Vec256", resultInArg0: false}, + {name: "VPADDUSB512", argLength: 2, reg: w21, asm: "VPADDUSB", commutative: true, typ: "Vec512", resultInArg0: false}, + {name: "VPADDUSBMasked128", argLength: 3, reg: w2kw, asm: "VPADDUSB", commutative: true, typ: "Vec128", resultInArg0: false}, + {name: "VPADDUSBMasked256", argLength: 3, reg: w2kw, asm: "VPADDUSB", commutative: true, typ: "Vec256", resultInArg0: false}, + {name: "VPADDUSBMasked512", argLength: 3, reg: w2kw, asm: "VPADDUSB", commutative: true, typ: "Vec512", resultInArg0: false}, + {name: "VPADDUSW128", argLength: 2, reg: v21, asm: "VPADDUSW", commutative: true, typ: "Vec128", resultInArg0: false}, + {name: "VPADDUSW256", argLength: 2, reg: v21, asm: "VPADDUSW", commutative: true, typ: "Vec256", resultInArg0: false}, + {name: "VPADDUSW512", argLength: 2, reg: w21, asm: "VPADDUSW", commutative: true, typ: "Vec512", resultInArg0: false}, + {name: "VPADDUSWMasked128", argLength: 3, reg: w2kw, asm: "VPADDUSW", commutative: true, typ: "Vec128", resultInArg0: false}, + {name: "VPADDUSWMasked256", argLength: 3, reg: w2kw, asm: "VPADDUSW", commutative: true, typ: "Vec256", resultInArg0: false}, + {name: "VPADDUSWMasked512", argLength: 3, reg: w2kw, asm: "VPADDUSW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPADDW128", argLength: 2, reg: v21, asm: "VPADDW", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPADDW256", argLength: 2, reg: v21, asm: "VPADDW", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPADDW512", argLength: 2, reg: w21, asm: "VPADDW", commutative: true, typ: "Vec512", resultInArg0: false}, @@ -497,22 +509,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, 
vgpv, vgp, vfpv, vf {name: "VPMINUWMasked512", argLength: 3, reg: w2kw, asm: "VPMINUW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMULDQ128", argLength: 2, reg: v21, asm: "VPMULDQ", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMULDQ256", argLength: 2, reg: v21, asm: "VPMULDQ", commutative: true, typ: "Vec256", resultInArg0: false}, - {name: "VPMULDQ512", argLength: 2, reg: w21, asm: "VPMULDQ", commutative: true, typ: "Vec512", resultInArg0: false}, - {name: "VPMULDQMasked128", argLength: 3, reg: w2kw, asm: "VPMULDQ", commutative: true, typ: "Vec128", resultInArg0: false}, - {name: "VPMULDQMasked256", argLength: 3, reg: w2kw, asm: "VPMULDQ", commutative: true, typ: "Vec256", resultInArg0: false}, - {name: "VPMULDQMasked512", argLength: 3, reg: w2kw, asm: "VPMULDQ", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMULHUW128", argLength: 2, reg: v21, asm: "VPMULHUW", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMULHUW256", argLength: 2, reg: v21, asm: "VPMULHUW", commutative: true, typ: "Vec256", resultInArg0: false}, - {name: "VPMULHUW512", argLength: 2, reg: w21, asm: "VPMULHUW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMULHUWMasked128", argLength: 3, reg: w2kw, asm: "VPMULHUW", commutative: true, typ: "Vec128", resultInArg0: false}, - {name: "VPMULHUWMasked256", argLength: 3, reg: w2kw, asm: "VPMULHUW", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMULHUWMasked512", argLength: 3, reg: w2kw, asm: "VPMULHUW", commutative: true, typ: "Vec512", resultInArg0: false}, - {name: "VPMULHW128", argLength: 2, reg: v21, asm: "VPMULHW", commutative: true, typ: "Vec128", resultInArg0: false}, - {name: "VPMULHW256", argLength: 2, reg: v21, asm: "VPMULHW", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMULHW512", argLength: 2, reg: w21, asm: "VPMULHW", commutative: true, typ: "Vec512", resultInArg0: false}, - {name: "VPMULHWMasked128", argLength: 3, reg: w2kw, asm: "VPMULHW", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMULHWMasked256", argLength: 3, reg: w2kw, asm: "VPMULHW", commutative: true, typ: "Vec256", resultInArg0: false}, - {name: "VPMULHWMasked512", argLength: 3, reg: w2kw, asm: "VPMULHW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMULLD128", argLength: 2, reg: v21, asm: "VPMULLD", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMULLD256", argLength: 2, reg: v21, asm: "VPMULLD", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMULLD512", argLength: 2, reg: w21, asm: "VPMULLD", commutative: true, typ: "Vec512", resultInArg0: false}, @@ -533,10 +535,6 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPMULLWMasked512", argLength: 3, reg: w2kw, asm: "VPMULLW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMULUDQ128", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMULUDQ256", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false}, - {name: "VPMULUDQ512", argLength: 2, reg: w21, asm: "VPMULUDQ", commutative: true, typ: "Vec512", resultInArg0: false}, - {name: "VPMULUDQMasked128", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec128", resultInArg0: false}, - {name: "VPMULUDQMasked256", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false}, - {name: 
"VPMULUDQMasked512", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPOPCNTB128", argLength: 1, reg: w11, asm: "VPOPCNTB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPOPCNTB256", argLength: 1, reg: w11, asm: "VPOPCNTB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPOPCNTB512", argLength: 1, reg: w11, asm: "VPOPCNTB", commutative: false, typ: "Vec512", resultInArg0: false}, @@ -775,6 +773,18 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPSUBSWMasked128", argLength: 3, reg: w2kw, asm: "VPSUBSW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSUBSWMasked256", argLength: 3, reg: w2kw, asm: "VPSUBSW", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSUBSWMasked512", argLength: 3, reg: w2kw, asm: "VPSUBSW", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPSUBUSB128", argLength: 2, reg: v21, asm: "VPSUBUSB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPSUBUSB256", argLength: 2, reg: v21, asm: "VPSUBUSB", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPSUBUSB512", argLength: 2, reg: w21, asm: "VPSUBUSB", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPSUBUSBMasked128", argLength: 3, reg: w2kw, asm: "VPSUBUSB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPSUBUSBMasked256", argLength: 3, reg: w2kw, asm: "VPSUBUSB", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPSUBUSBMasked512", argLength: 3, reg: w2kw, asm: "VPSUBUSB", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPSUBUSW128", argLength: 2, reg: v21, asm: "VPSUBUSW", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPSUBUSW256", argLength: 2, reg: v21, asm: "VPSUBUSW", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPSUBUSW512", argLength: 2, reg: w21, asm: "VPSUBUSW", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPSUBUSWMasked128", argLength: 3, reg: w2kw, asm: "VPSUBUSW", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPSUBUSWMasked256", argLength: 3, reg: w2kw, asm: "VPSUBUSW", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPSUBUSWMasked512", argLength: 3, reg: w2kw, asm: "VPSUBUSW", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSUBW128", argLength: 2, reg: v21, asm: "VPSUBW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSUBW256", argLength: 2, reg: v21, asm: "VPSUBW", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSUBW512", argLength: 2, reg: w21, asm: "VPSUBW", commutative: false, typ: "Vec512", resultInArg0: false}, @@ -879,9 +889,6 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VREDUCEPDMasked128", argLength: 2, reg: wkw, asm: "VREDUCEPD", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VREDUCEPDMasked256", argLength: 2, reg: wkw, asm: "VREDUCEPD", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VREDUCEPDMasked512", argLength: 2, reg: wkw, asm: "VREDUCEPD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, - {name: "VDPPS128", argLength: 2, reg: v21, asm: "VDPPS", aux: "Int8", commutative: true, typ: "Vec128", resultInArg0: false}, - {name: "VDPPS256", argLength: 2, reg: v21, asm: "VDPPS", aux: "Int8", commutative: true, 
typ: "Vec256", resultInArg0: false}, - {name: "VDPPD128", argLength: 2, reg: v21, asm: "VDPPD", aux: "Int8", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VCMPPS128", argLength: 2, reg: v21, asm: "VCMPPS", aux: "Int8", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VCMPPS256", argLength: 2, reg: v21, asm: "VCMPPS", aux: "Int8", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VCMPPS512", argLength: 2, reg: w2k, asm: "VCMPPS", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index d0a4a494b1..45c62f95a7 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -3,36 +3,48 @@ package main func simdGenericOps() []opData { return []opData{ - {name: "AbsoluteInt8x16", argLength: 1, commutative: false}, - {name: "AbsoluteInt8x32", argLength: 1, commutative: false}, - {name: "AbsoluteInt8x64", argLength: 1, commutative: false}, - {name: "AbsoluteInt16x8", argLength: 1, commutative: false}, - {name: "AbsoluteInt16x16", argLength: 1, commutative: false}, - {name: "AbsoluteInt16x32", argLength: 1, commutative: false}, - {name: "AbsoluteInt32x4", argLength: 1, commutative: false}, - {name: "AbsoluteInt32x8", argLength: 1, commutative: false}, - {name: "AbsoluteInt32x16", argLength: 1, commutative: false}, - {name: "AbsoluteInt64x2", argLength: 1, commutative: false}, - {name: "AbsoluteInt64x4", argLength: 1, commutative: false}, - {name: "AbsoluteInt64x8", argLength: 1, commutative: false}, - {name: "AbsoluteMaskedInt8x16", argLength: 2, commutative: false}, - {name: "AbsoluteMaskedInt8x32", argLength: 2, commutative: false}, - {name: "AbsoluteMaskedInt8x64", argLength: 2, commutative: false}, - {name: "AbsoluteMaskedInt16x8", argLength: 2, commutative: false}, - {name: "AbsoluteMaskedInt16x16", argLength: 2, commutative: false}, - {name: "AbsoluteMaskedInt16x32", argLength: 2, commutative: false}, - {name: "AbsoluteMaskedInt32x4", argLength: 2, commutative: false}, - {name: "AbsoluteMaskedInt32x8", argLength: 2, commutative: false}, - {name: "AbsoluteMaskedInt32x16", argLength: 2, commutative: false}, - {name: "AbsoluteMaskedInt64x2", argLength: 2, commutative: false}, - {name: "AbsoluteMaskedInt64x4", argLength: 2, commutative: false}, - {name: "AbsoluteMaskedInt64x8", argLength: 2, commutative: false}, - {name: "AddDotProdInt32x4", argLength: 3, commutative: false}, - {name: "AddDotProdInt32x8", argLength: 3, commutative: false}, - {name: "AddDotProdInt32x16", argLength: 3, commutative: false}, - {name: "AddDotProdMaskedInt32x4", argLength: 4, commutative: false}, - {name: "AddDotProdMaskedInt32x8", argLength: 4, commutative: false}, - {name: "AddDotProdMaskedInt32x16", argLength: 4, commutative: false}, + {name: "AbsInt8x16", argLength: 1, commutative: false}, + {name: "AbsInt8x32", argLength: 1, commutative: false}, + {name: "AbsInt8x64", argLength: 1, commutative: false}, + {name: "AbsInt16x8", argLength: 1, commutative: false}, + {name: "AbsInt16x16", argLength: 1, commutative: false}, + {name: "AbsInt16x32", argLength: 1, commutative: false}, + {name: "AbsInt32x4", argLength: 1, commutative: false}, + {name: "AbsInt32x8", argLength: 1, commutative: false}, + {name: "AbsInt32x16", argLength: 1, commutative: false}, + {name: "AbsInt64x2", argLength: 1, commutative: false}, + {name: "AbsInt64x4", argLength: 1, commutative: false}, + {name: 
"AbsInt64x8", argLength: 1, commutative: false}, + {name: "AbsMaskedInt8x16", argLength: 2, commutative: false}, + {name: "AbsMaskedInt8x32", argLength: 2, commutative: false}, + {name: "AbsMaskedInt8x64", argLength: 2, commutative: false}, + {name: "AbsMaskedInt16x8", argLength: 2, commutative: false}, + {name: "AbsMaskedInt16x16", argLength: 2, commutative: false}, + {name: "AbsMaskedInt16x32", argLength: 2, commutative: false}, + {name: "AbsMaskedInt32x4", argLength: 2, commutative: false}, + {name: "AbsMaskedInt32x8", argLength: 2, commutative: false}, + {name: "AbsMaskedInt32x16", argLength: 2, commutative: false}, + {name: "AbsMaskedInt64x2", argLength: 2, commutative: false}, + {name: "AbsMaskedInt64x4", argLength: 2, commutative: false}, + {name: "AbsMaskedInt64x8", argLength: 2, commutative: false}, + {name: "AddDotProdPairsSaturatedInt32x4", argLength: 3, commutative: false}, + {name: "AddDotProdPairsSaturatedInt32x8", argLength: 3, commutative: false}, + {name: "AddDotProdPairsSaturatedInt32x16", argLength: 3, commutative: false}, + {name: "AddDotProdPairsSaturatedMaskedInt32x4", argLength: 4, commutative: false}, + {name: "AddDotProdPairsSaturatedMaskedInt32x8", argLength: 4, commutative: false}, + {name: "AddDotProdPairsSaturatedMaskedInt32x16", argLength: 4, commutative: false}, + {name: "AddDotProdQuadrupleInt32x4", argLength: 3, commutative: false}, + {name: "AddDotProdQuadrupleInt32x8", argLength: 3, commutative: false}, + {name: "AddDotProdQuadrupleInt32x16", argLength: 3, commutative: false}, + {name: "AddDotProdQuadrupleMaskedInt32x4", argLength: 4, commutative: false}, + {name: "AddDotProdQuadrupleMaskedInt32x8", argLength: 4, commutative: false}, + {name: "AddDotProdQuadrupleMaskedInt32x16", argLength: 4, commutative: false}, + {name: "AddDotProdQuadrupleSaturatedInt32x4", argLength: 3, commutative: false}, + {name: "AddDotProdQuadrupleSaturatedInt32x8", argLength: 3, commutative: false}, + {name: "AddDotProdQuadrupleSaturatedInt32x16", argLength: 3, commutative: false}, + {name: "AddDotProdQuadrupleSaturatedMaskedInt32x4", argLength: 4, commutative: false}, + {name: "AddDotProdQuadrupleSaturatedMaskedInt32x8", argLength: 4, commutative: false}, + {name: "AddDotProdQuadrupleSaturatedMaskedInt32x16", argLength: 4, commutative: false}, {name: "AddFloat32x4", argLength: 2, commutative: true}, {name: "AddFloat32x8", argLength: 2, commutative: true}, {name: "AddFloat32x16", argLength: 2, commutative: true}, @@ -207,30 +219,6 @@ func simdGenericOps() []opData { {name: "AndUint64x2", argLength: 2, commutative: true}, {name: "AndUint64x4", argLength: 2, commutative: true}, {name: "AndUint64x8", argLength: 2, commutative: true}, - {name: "ApproximateReciprocalFloat32x4", argLength: 1, commutative: false}, - {name: "ApproximateReciprocalFloat32x8", argLength: 1, commutative: false}, - {name: "ApproximateReciprocalFloat32x16", argLength: 1, commutative: false}, - {name: "ApproximateReciprocalFloat64x2", argLength: 1, commutative: false}, - {name: "ApproximateReciprocalFloat64x4", argLength: 1, commutative: false}, - {name: "ApproximateReciprocalFloat64x8", argLength: 1, commutative: false}, - {name: "ApproximateReciprocalMaskedFloat32x4", argLength: 2, commutative: false}, - {name: "ApproximateReciprocalMaskedFloat32x8", argLength: 2, commutative: false}, - {name: "ApproximateReciprocalMaskedFloat32x16", argLength: 2, commutative: false}, - {name: "ApproximateReciprocalMaskedFloat64x2", argLength: 2, commutative: false}, - {name: "ApproximateReciprocalMaskedFloat64x4", 
argLength: 2, commutative: false}, - {name: "ApproximateReciprocalMaskedFloat64x8", argLength: 2, commutative: false}, - {name: "ApproximateReciprocalOfSqrtFloat32x4", argLength: 1, commutative: false}, - {name: "ApproximateReciprocalOfSqrtFloat32x8", argLength: 1, commutative: false}, - {name: "ApproximateReciprocalOfSqrtFloat32x16", argLength: 1, commutative: false}, - {name: "ApproximateReciprocalOfSqrtFloat64x2", argLength: 1, commutative: false}, - {name: "ApproximateReciprocalOfSqrtFloat64x4", argLength: 1, commutative: false}, - {name: "ApproximateReciprocalOfSqrtFloat64x8", argLength: 1, commutative: false}, - {name: "ApproximateReciprocalOfSqrtMaskedFloat32x4", argLength: 2, commutative: false}, - {name: "ApproximateReciprocalOfSqrtMaskedFloat32x8", argLength: 2, commutative: false}, - {name: "ApproximateReciprocalOfSqrtMaskedFloat32x16", argLength: 2, commutative: false}, - {name: "ApproximateReciprocalOfSqrtMaskedFloat64x2", argLength: 2, commutative: false}, - {name: "ApproximateReciprocalOfSqrtMaskedFloat64x4", argLength: 2, commutative: false}, - {name: "ApproximateReciprocalOfSqrtMaskedFloat64x8", argLength: 2, commutative: false}, {name: "AverageMaskedUint8x16", argLength: 3, commutative: true}, {name: "AverageMaskedUint8x32", argLength: 3, commutative: true}, {name: "AverageMaskedUint8x64", argLength: 3, commutative: true}, @@ -289,6 +277,12 @@ func simdGenericOps() []opData { {name: "ConvertToUint32MaskedFloat32x4", argLength: 2, commutative: false}, {name: "ConvertToUint32MaskedFloat32x8", argLength: 2, commutative: false}, {name: "ConvertToUint32MaskedFloat32x16", argLength: 2, commutative: false}, + {name: "CopySignInt8x16", argLength: 2, commutative: false}, + {name: "CopySignInt8x32", argLength: 2, commutative: false}, + {name: "CopySignInt16x8", argLength: 2, commutative: false}, + {name: "CopySignInt16x16", argLength: 2, commutative: false}, + {name: "CopySignInt32x4", argLength: 2, commutative: false}, + {name: "CopySignInt32x8", argLength: 2, commutative: false}, {name: "DivFloat32x4", argLength: 2, commutative: false}, {name: "DivFloat32x8", argLength: 2, commutative: false}, {name: "DivFloat32x16", argLength: 2, commutative: false}, @@ -301,9 +295,18 @@ func simdGenericOps() []opData { {name: "DivMaskedFloat64x2", argLength: 3, commutative: false}, {name: "DivMaskedFloat64x4", argLength: 3, commutative: false}, {name: "DivMaskedFloat64x8", argLength: 3, commutative: false}, - {name: "DotProdBroadcastFloat32x4", argLength: 2, commutative: true}, - {name: "DotProdBroadcastFloat32x8", argLength: 2, commutative: true}, - {name: "DotProdBroadcastFloat64x2", argLength: 2, commutative: true}, + {name: "DotProdPairsInt16x8", argLength: 2, commutative: false}, + {name: "DotProdPairsInt16x16", argLength: 2, commutative: false}, + {name: "DotProdPairsInt16x32", argLength: 2, commutative: false}, + {name: "DotProdPairsMaskedInt16x8", argLength: 3, commutative: false}, + {name: "DotProdPairsMaskedInt16x16", argLength: 3, commutative: false}, + {name: "DotProdPairsMaskedInt16x32", argLength: 3, commutative: false}, + {name: "DotProdPairsSaturatedMaskedUint8x16", argLength: 3, commutative: false}, + {name: "DotProdPairsSaturatedMaskedUint8x32", argLength: 3, commutative: false}, + {name: "DotProdPairsSaturatedMaskedUint8x64", argLength: 3, commutative: false}, + {name: "DotProdPairsSaturatedUint8x16", argLength: 2, commutative: false}, + {name: "DotProdPairsSaturatedUint8x32", argLength: 2, commutative: false}, + {name: "DotProdPairsSaturatedUint8x64", argLength: 2, 
commutative: false}, {name: "EqualFloat32x4", argLength: 2, commutative: true}, {name: "EqualFloat32x8", argLength: 2, commutative: true}, {name: "EqualFloat32x16", argLength: 2, commutative: true}, @@ -398,42 +401,6 @@ func simdGenericOps() []opData { {name: "FloorFloat32x8", argLength: 1, commutative: false}, {name: "FloorFloat64x2", argLength: 1, commutative: false}, {name: "FloorFloat64x4", argLength: 1, commutative: false}, - {name: "FusedMultiplyAddFloat32x4", argLength: 3, commutative: false}, - {name: "FusedMultiplyAddFloat32x8", argLength: 3, commutative: false}, - {name: "FusedMultiplyAddFloat32x16", argLength: 3, commutative: false}, - {name: "FusedMultiplyAddFloat64x2", argLength: 3, commutative: false}, - {name: "FusedMultiplyAddFloat64x4", argLength: 3, commutative: false}, - {name: "FusedMultiplyAddFloat64x8", argLength: 3, commutative: false}, - {name: "FusedMultiplyAddMaskedFloat32x4", argLength: 4, commutative: false}, - {name: "FusedMultiplyAddMaskedFloat32x8", argLength: 4, commutative: false}, - {name: "FusedMultiplyAddMaskedFloat32x16", argLength: 4, commutative: false}, - {name: "FusedMultiplyAddMaskedFloat64x2", argLength: 4, commutative: false}, - {name: "FusedMultiplyAddMaskedFloat64x4", argLength: 4, commutative: false}, - {name: "FusedMultiplyAddMaskedFloat64x8", argLength: 4, commutative: false}, - {name: "FusedMultiplyAddSubFloat32x4", argLength: 3, commutative: false}, - {name: "FusedMultiplyAddSubFloat32x8", argLength: 3, commutative: false}, - {name: "FusedMultiplyAddSubFloat32x16", argLength: 3, commutative: false}, - {name: "FusedMultiplyAddSubFloat64x2", argLength: 3, commutative: false}, - {name: "FusedMultiplyAddSubFloat64x4", argLength: 3, commutative: false}, - {name: "FusedMultiplyAddSubFloat64x8", argLength: 3, commutative: false}, - {name: "FusedMultiplyAddSubMaskedFloat32x4", argLength: 4, commutative: false}, - {name: "FusedMultiplyAddSubMaskedFloat32x8", argLength: 4, commutative: false}, - {name: "FusedMultiplyAddSubMaskedFloat32x16", argLength: 4, commutative: false}, - {name: "FusedMultiplyAddSubMaskedFloat64x2", argLength: 4, commutative: false}, - {name: "FusedMultiplyAddSubMaskedFloat64x4", argLength: 4, commutative: false}, - {name: "FusedMultiplyAddSubMaskedFloat64x8", argLength: 4, commutative: false}, - {name: "FusedMultiplySubAddFloat32x4", argLength: 3, commutative: false}, - {name: "FusedMultiplySubAddFloat32x8", argLength: 3, commutative: false}, - {name: "FusedMultiplySubAddFloat32x16", argLength: 3, commutative: false}, - {name: "FusedMultiplySubAddFloat64x2", argLength: 3, commutative: false}, - {name: "FusedMultiplySubAddFloat64x4", argLength: 3, commutative: false}, - {name: "FusedMultiplySubAddFloat64x8", argLength: 3, commutative: false}, - {name: "FusedMultiplySubAddMaskedFloat32x4", argLength: 4, commutative: false}, - {name: "FusedMultiplySubAddMaskedFloat32x8", argLength: 4, commutative: false}, - {name: "FusedMultiplySubAddMaskedFloat32x16", argLength: 4, commutative: false}, - {name: "FusedMultiplySubAddMaskedFloat64x2", argLength: 4, commutative: false}, - {name: "FusedMultiplySubAddMaskedFloat64x4", argLength: 4, commutative: false}, - {name: "FusedMultiplySubAddMaskedFloat64x8", argLength: 4, commutative: false}, {name: "GaloisFieldMulMaskedUint8x16", argLength: 3, commutative: false}, {name: "GaloisFieldMulMaskedUint8x32", argLength: 3, commutative: false}, {name: "GaloisFieldMulMaskedUint8x64", argLength: 3, commutative: false}, @@ -852,22 +819,34 @@ func simdGenericOps() []opData { {name: "MinUint64x2", 
argLength: 2, commutative: true}, {name: "MinUint64x4", argLength: 2, commutative: true}, {name: "MinUint64x8", argLength: 2, commutative: true}, + {name: "MulAddFloat32x4", argLength: 3, commutative: false}, + {name: "MulAddFloat32x8", argLength: 3, commutative: false}, + {name: "MulAddFloat32x16", argLength: 3, commutative: false}, + {name: "MulAddFloat64x2", argLength: 3, commutative: false}, + {name: "MulAddFloat64x4", argLength: 3, commutative: false}, + {name: "MulAddFloat64x8", argLength: 3, commutative: false}, + {name: "MulAddMaskedFloat32x4", argLength: 4, commutative: false}, + {name: "MulAddMaskedFloat32x8", argLength: 4, commutative: false}, + {name: "MulAddMaskedFloat32x16", argLength: 4, commutative: false}, + {name: "MulAddMaskedFloat64x2", argLength: 4, commutative: false}, + {name: "MulAddMaskedFloat64x4", argLength: 4, commutative: false}, + {name: "MulAddMaskedFloat64x8", argLength: 4, commutative: false}, + {name: "MulAddSubFloat32x4", argLength: 3, commutative: false}, + {name: "MulAddSubFloat32x8", argLength: 3, commutative: false}, + {name: "MulAddSubFloat32x16", argLength: 3, commutative: false}, + {name: "MulAddSubFloat64x2", argLength: 3, commutative: false}, + {name: "MulAddSubFloat64x4", argLength: 3, commutative: false}, + {name: "MulAddSubFloat64x8", argLength: 3, commutative: false}, + {name: "MulAddSubMaskedFloat32x4", argLength: 4, commutative: false}, + {name: "MulAddSubMaskedFloat32x8", argLength: 4, commutative: false}, + {name: "MulAddSubMaskedFloat32x16", argLength: 4, commutative: false}, + {name: "MulAddSubMaskedFloat64x2", argLength: 4, commutative: false}, + {name: "MulAddSubMaskedFloat64x4", argLength: 4, commutative: false}, + {name: "MulAddSubMaskedFloat64x8", argLength: 4, commutative: false}, {name: "MulEvenWidenInt32x4", argLength: 2, commutative: true}, {name: "MulEvenWidenInt32x8", argLength: 2, commutative: true}, - {name: "MulEvenWidenInt64x2", argLength: 2, commutative: true}, - {name: "MulEvenWidenInt64x4", argLength: 2, commutative: true}, - {name: "MulEvenWidenInt64x8", argLength: 2, commutative: true}, - {name: "MulEvenWidenMaskedInt64x2", argLength: 3, commutative: true}, - {name: "MulEvenWidenMaskedInt64x4", argLength: 3, commutative: true}, - {name: "MulEvenWidenMaskedInt64x8", argLength: 3, commutative: true}, - {name: "MulEvenWidenMaskedUint64x2", argLength: 3, commutative: true}, - {name: "MulEvenWidenMaskedUint64x4", argLength: 3, commutative: true}, - {name: "MulEvenWidenMaskedUint64x8", argLength: 3, commutative: true}, {name: "MulEvenWidenUint32x4", argLength: 2, commutative: true}, {name: "MulEvenWidenUint32x8", argLength: 2, commutative: true}, - {name: "MulEvenWidenUint64x2", argLength: 2, commutative: true}, - {name: "MulEvenWidenUint64x4", argLength: 2, commutative: true}, - {name: "MulEvenWidenUint64x8", argLength: 2, commutative: true}, {name: "MulFloat32x4", argLength: 2, commutative: true}, {name: "MulFloat32x8", argLength: 2, commutative: true}, {name: "MulFloat32x16", argLength: 2, commutative: true}, @@ -880,12 +859,6 @@ func simdGenericOps() []opData { {name: "MulHighMaskedInt16x8", argLength: 3, commutative: true}, {name: "MulHighMaskedInt16x16", argLength: 3, commutative: true}, {name: "MulHighMaskedInt16x32", argLength: 3, commutative: true}, - {name: "MulHighMaskedUint16x8", argLength: 3, commutative: true}, - {name: "MulHighMaskedUint16x16", argLength: 3, commutative: true}, - {name: "MulHighMaskedUint16x32", argLength: 3, commutative: true}, - {name: "MulHighUint16x8", argLength: 2, commutative: true}, - 
{name: "MulHighUint16x16", argLength: 2, commutative: true}, - {name: "MulHighUint16x32", argLength: 2, commutative: true}, {name: "MulInt16x8", argLength: 2, commutative: true}, {name: "MulInt16x16", argLength: 2, commutative: true}, {name: "MulInt16x32", argLength: 2, commutative: true}, @@ -910,6 +883,36 @@ func simdGenericOps() []opData { {name: "MulMaskedInt64x2", argLength: 3, commutative: true}, {name: "MulMaskedInt64x4", argLength: 3, commutative: true}, {name: "MulMaskedInt64x8", argLength: 3, commutative: true}, + {name: "MulMaskedUint16x8", argLength: 3, commutative: true}, + {name: "MulMaskedUint16x16", argLength: 3, commutative: true}, + {name: "MulMaskedUint16x32", argLength: 3, commutative: true}, + {name: "MulMaskedUint32x4", argLength: 3, commutative: true}, + {name: "MulMaskedUint32x8", argLength: 3, commutative: true}, + {name: "MulMaskedUint32x16", argLength: 3, commutative: true}, + {name: "MulMaskedUint64x2", argLength: 3, commutative: true}, + {name: "MulMaskedUint64x4", argLength: 3, commutative: true}, + {name: "MulMaskedUint64x8", argLength: 3, commutative: true}, + {name: "MulSubAddFloat32x4", argLength: 3, commutative: false}, + {name: "MulSubAddFloat32x8", argLength: 3, commutative: false}, + {name: "MulSubAddFloat32x16", argLength: 3, commutative: false}, + {name: "MulSubAddFloat64x2", argLength: 3, commutative: false}, + {name: "MulSubAddFloat64x4", argLength: 3, commutative: false}, + {name: "MulSubAddFloat64x8", argLength: 3, commutative: false}, + {name: "MulSubAddMaskedFloat32x4", argLength: 4, commutative: false}, + {name: "MulSubAddMaskedFloat32x8", argLength: 4, commutative: false}, + {name: "MulSubAddMaskedFloat32x16", argLength: 4, commutative: false}, + {name: "MulSubAddMaskedFloat64x2", argLength: 4, commutative: false}, + {name: "MulSubAddMaskedFloat64x4", argLength: 4, commutative: false}, + {name: "MulSubAddMaskedFloat64x8", argLength: 4, commutative: false}, + {name: "MulUint16x8", argLength: 2, commutative: true}, + {name: "MulUint16x16", argLength: 2, commutative: true}, + {name: "MulUint16x32", argLength: 2, commutative: true}, + {name: "MulUint32x4", argLength: 2, commutative: true}, + {name: "MulUint32x8", argLength: 2, commutative: true}, + {name: "MulUint32x16", argLength: 2, commutative: true}, + {name: "MulUint64x2", argLength: 2, commutative: true}, + {name: "MulUint64x4", argLength: 2, commutative: true}, + {name: "MulUint64x8", argLength: 2, commutative: true}, {name: "NotEqualFloat32x4", argLength: 2, commutative: true}, {name: "NotEqualFloat32x8", argLength: 2, commutative: true}, {name: "NotEqualFloat32x16", argLength: 2, commutative: true}, @@ -970,6 +973,54 @@ func simdGenericOps() []opData { {name: "NotEqualUint64x2", argLength: 2, commutative: true}, {name: "NotEqualUint64x4", argLength: 2, commutative: true}, {name: "NotEqualUint64x8", argLength: 2, commutative: true}, + {name: "OnesCountInt8x16", argLength: 1, commutative: false}, + {name: "OnesCountInt8x32", argLength: 1, commutative: false}, + {name: "OnesCountInt8x64", argLength: 1, commutative: false}, + {name: "OnesCountInt16x8", argLength: 1, commutative: false}, + {name: "OnesCountInt16x16", argLength: 1, commutative: false}, + {name: "OnesCountInt16x32", argLength: 1, commutative: false}, + {name: "OnesCountInt32x4", argLength: 1, commutative: false}, + {name: "OnesCountInt32x8", argLength: 1, commutative: false}, + {name: "OnesCountInt32x16", argLength: 1, commutative: false}, + {name: "OnesCountInt64x2", argLength: 1, commutative: false}, + {name: 
"OnesCountInt64x4", argLength: 1, commutative: false}, + {name: "OnesCountInt64x8", argLength: 1, commutative: false}, + {name: "OnesCountMaskedInt8x16", argLength: 2, commutative: false}, + {name: "OnesCountMaskedInt8x32", argLength: 2, commutative: false}, + {name: "OnesCountMaskedInt8x64", argLength: 2, commutative: false}, + {name: "OnesCountMaskedInt16x8", argLength: 2, commutative: false}, + {name: "OnesCountMaskedInt16x16", argLength: 2, commutative: false}, + {name: "OnesCountMaskedInt16x32", argLength: 2, commutative: false}, + {name: "OnesCountMaskedInt32x4", argLength: 2, commutative: false}, + {name: "OnesCountMaskedInt32x8", argLength: 2, commutative: false}, + {name: "OnesCountMaskedInt32x16", argLength: 2, commutative: false}, + {name: "OnesCountMaskedInt64x2", argLength: 2, commutative: false}, + {name: "OnesCountMaskedInt64x4", argLength: 2, commutative: false}, + {name: "OnesCountMaskedInt64x8", argLength: 2, commutative: false}, + {name: "OnesCountMaskedUint8x16", argLength: 2, commutative: false}, + {name: "OnesCountMaskedUint8x32", argLength: 2, commutative: false}, + {name: "OnesCountMaskedUint8x64", argLength: 2, commutative: false}, + {name: "OnesCountMaskedUint16x8", argLength: 2, commutative: false}, + {name: "OnesCountMaskedUint16x16", argLength: 2, commutative: false}, + {name: "OnesCountMaskedUint16x32", argLength: 2, commutative: false}, + {name: "OnesCountMaskedUint32x4", argLength: 2, commutative: false}, + {name: "OnesCountMaskedUint32x8", argLength: 2, commutative: false}, + {name: "OnesCountMaskedUint32x16", argLength: 2, commutative: false}, + {name: "OnesCountMaskedUint64x2", argLength: 2, commutative: false}, + {name: "OnesCountMaskedUint64x4", argLength: 2, commutative: false}, + {name: "OnesCountMaskedUint64x8", argLength: 2, commutative: false}, + {name: "OnesCountUint8x16", argLength: 1, commutative: false}, + {name: "OnesCountUint8x32", argLength: 1, commutative: false}, + {name: "OnesCountUint8x64", argLength: 1, commutative: false}, + {name: "OnesCountUint16x8", argLength: 1, commutative: false}, + {name: "OnesCountUint16x16", argLength: 1, commutative: false}, + {name: "OnesCountUint16x32", argLength: 1, commutative: false}, + {name: "OnesCountUint32x4", argLength: 1, commutative: false}, + {name: "OnesCountUint32x8", argLength: 1, commutative: false}, + {name: "OnesCountUint32x16", argLength: 1, commutative: false}, + {name: "OnesCountUint64x2", argLength: 1, commutative: false}, + {name: "OnesCountUint64x4", argLength: 1, commutative: false}, + {name: "OnesCountUint64x8", argLength: 1, commutative: false}, {name: "OrInt8x16", argLength: 2, commutative: true}, {name: "OrInt8x32", argLength: 2, commutative: true}, {name: "OrInt8x64", argLength: 2, commutative: true}, @@ -1006,12 +1057,6 @@ func simdGenericOps() []opData { {name: "OrUint64x2", argLength: 2, commutative: true}, {name: "OrUint64x4", argLength: 2, commutative: true}, {name: "OrUint64x8", argLength: 2, commutative: true}, - {name: "PairDotProdInt16x8", argLength: 2, commutative: false}, - {name: "PairDotProdInt16x16", argLength: 2, commutative: false}, - {name: "PairDotProdInt16x32", argLength: 2, commutative: false}, - {name: "PairDotProdMaskedInt16x8", argLength: 3, commutative: false}, - {name: "PairDotProdMaskedInt16x16", argLength: 3, commutative: false}, - {name: "PairDotProdMaskedInt16x32", argLength: 3, commutative: false}, {name: "Permute2Float32x4", argLength: 3, commutative: false}, {name: "Permute2Float32x8", argLength: 3, commutative: false}, {name: 
"Permute2Float32x16", argLength: 3, commutative: false}, @@ -1120,54 +1165,30 @@ func simdGenericOps() []opData { {name: "PermuteUint32x16", argLength: 2, commutative: false}, {name: "PermuteUint64x4", argLength: 2, commutative: false}, {name: "PermuteUint64x8", argLength: 2, commutative: false}, - {name: "PopCountInt8x16", argLength: 1, commutative: false}, - {name: "PopCountInt8x32", argLength: 1, commutative: false}, - {name: "PopCountInt8x64", argLength: 1, commutative: false}, - {name: "PopCountInt16x8", argLength: 1, commutative: false}, - {name: "PopCountInt16x16", argLength: 1, commutative: false}, - {name: "PopCountInt16x32", argLength: 1, commutative: false}, - {name: "PopCountInt32x4", argLength: 1, commutative: false}, - {name: "PopCountInt32x8", argLength: 1, commutative: false}, - {name: "PopCountInt32x16", argLength: 1, commutative: false}, - {name: "PopCountInt64x2", argLength: 1, commutative: false}, - {name: "PopCountInt64x4", argLength: 1, commutative: false}, - {name: "PopCountInt64x8", argLength: 1, commutative: false}, - {name: "PopCountMaskedInt8x16", argLength: 2, commutative: false}, - {name: "PopCountMaskedInt8x32", argLength: 2, commutative: false}, - {name: "PopCountMaskedInt8x64", argLength: 2, commutative: false}, - {name: "PopCountMaskedInt16x8", argLength: 2, commutative: false}, - {name: "PopCountMaskedInt16x16", argLength: 2, commutative: false}, - {name: "PopCountMaskedInt16x32", argLength: 2, commutative: false}, - {name: "PopCountMaskedInt32x4", argLength: 2, commutative: false}, - {name: "PopCountMaskedInt32x8", argLength: 2, commutative: false}, - {name: "PopCountMaskedInt32x16", argLength: 2, commutative: false}, - {name: "PopCountMaskedInt64x2", argLength: 2, commutative: false}, - {name: "PopCountMaskedInt64x4", argLength: 2, commutative: false}, - {name: "PopCountMaskedInt64x8", argLength: 2, commutative: false}, - {name: "PopCountMaskedUint8x16", argLength: 2, commutative: false}, - {name: "PopCountMaskedUint8x32", argLength: 2, commutative: false}, - {name: "PopCountMaskedUint8x64", argLength: 2, commutative: false}, - {name: "PopCountMaskedUint16x8", argLength: 2, commutative: false}, - {name: "PopCountMaskedUint16x16", argLength: 2, commutative: false}, - {name: "PopCountMaskedUint16x32", argLength: 2, commutative: false}, - {name: "PopCountMaskedUint32x4", argLength: 2, commutative: false}, - {name: "PopCountMaskedUint32x8", argLength: 2, commutative: false}, - {name: "PopCountMaskedUint32x16", argLength: 2, commutative: false}, - {name: "PopCountMaskedUint64x2", argLength: 2, commutative: false}, - {name: "PopCountMaskedUint64x4", argLength: 2, commutative: false}, - {name: "PopCountMaskedUint64x8", argLength: 2, commutative: false}, - {name: "PopCountUint8x16", argLength: 1, commutative: false}, - {name: "PopCountUint8x32", argLength: 1, commutative: false}, - {name: "PopCountUint8x64", argLength: 1, commutative: false}, - {name: "PopCountUint16x8", argLength: 1, commutative: false}, - {name: "PopCountUint16x16", argLength: 1, commutative: false}, - {name: "PopCountUint16x32", argLength: 1, commutative: false}, - {name: "PopCountUint32x4", argLength: 1, commutative: false}, - {name: "PopCountUint32x8", argLength: 1, commutative: false}, - {name: "PopCountUint32x16", argLength: 1, commutative: false}, - {name: "PopCountUint64x2", argLength: 1, commutative: false}, - {name: "PopCountUint64x4", argLength: 1, commutative: false}, - {name: "PopCountUint64x8", argLength: 1, commutative: false}, + {name: "ReciprocalFloat32x4", argLength: 1, 
commutative: false}, + {name: "ReciprocalFloat32x8", argLength: 1, commutative: false}, + {name: "ReciprocalFloat32x16", argLength: 1, commutative: false}, + {name: "ReciprocalFloat64x2", argLength: 1, commutative: false}, + {name: "ReciprocalFloat64x4", argLength: 1, commutative: false}, + {name: "ReciprocalFloat64x8", argLength: 1, commutative: false}, + {name: "ReciprocalMaskedFloat32x4", argLength: 2, commutative: false}, + {name: "ReciprocalMaskedFloat32x8", argLength: 2, commutative: false}, + {name: "ReciprocalMaskedFloat32x16", argLength: 2, commutative: false}, + {name: "ReciprocalMaskedFloat64x2", argLength: 2, commutative: false}, + {name: "ReciprocalMaskedFloat64x4", argLength: 2, commutative: false}, + {name: "ReciprocalMaskedFloat64x8", argLength: 2, commutative: false}, + {name: "ReciprocalSqrtFloat32x4", argLength: 1, commutative: false}, + {name: "ReciprocalSqrtFloat32x8", argLength: 1, commutative: false}, + {name: "ReciprocalSqrtFloat32x16", argLength: 1, commutative: false}, + {name: "ReciprocalSqrtFloat64x2", argLength: 1, commutative: false}, + {name: "ReciprocalSqrtFloat64x4", argLength: 1, commutative: false}, + {name: "ReciprocalSqrtFloat64x8", argLength: 1, commutative: false}, + {name: "ReciprocalSqrtMaskedFloat32x4", argLength: 2, commutative: false}, + {name: "ReciprocalSqrtMaskedFloat32x8", argLength: 2, commutative: false}, + {name: "ReciprocalSqrtMaskedFloat32x16", argLength: 2, commutative: false}, + {name: "ReciprocalSqrtMaskedFloat64x2", argLength: 2, commutative: false}, + {name: "ReciprocalSqrtMaskedFloat64x4", argLength: 2, commutative: false}, + {name: "ReciprocalSqrtMaskedFloat64x8", argLength: 2, commutative: false}, {name: "RotateLeftInt32x4", argLength: 2, commutative: false}, {name: "RotateLeftInt32x8", argLength: 2, commutative: false}, {name: "RotateLeftInt32x16", argLength: 2, commutative: false}, @@ -1216,28 +1237,10 @@ func simdGenericOps() []opData { {name: "RotateRightUint64x2", argLength: 2, commutative: false}, {name: "RotateRightUint64x4", argLength: 2, commutative: false}, {name: "RotateRightUint64x8", argLength: 2, commutative: false}, - {name: "RoundFloat32x4", argLength: 1, commutative: false}, - {name: "RoundFloat32x8", argLength: 1, commutative: false}, - {name: "RoundFloat64x2", argLength: 1, commutative: false}, - {name: "RoundFloat64x4", argLength: 1, commutative: false}, - {name: "SaturatedAddDotProdInt32x4", argLength: 3, commutative: false}, - {name: "SaturatedAddDotProdInt32x8", argLength: 3, commutative: false}, - {name: "SaturatedAddDotProdInt32x16", argLength: 3, commutative: false}, - {name: "SaturatedAddDotProdMaskedInt32x4", argLength: 4, commutative: false}, - {name: "SaturatedAddDotProdMaskedInt32x8", argLength: 4, commutative: false}, - {name: "SaturatedAddDotProdMaskedInt32x16", argLength: 4, commutative: false}, - {name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x16", argLength: 3, commutative: false}, - {name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x32", argLength: 3, commutative: false}, - {name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x64", argLength: 3, commutative: false}, - {name: "SaturatedUnsignedSignedPairDotProdUint8x16", argLength: 2, commutative: false}, - {name: "SaturatedUnsignedSignedPairDotProdUint8x32", argLength: 2, commutative: false}, - {name: "SaturatedUnsignedSignedPairDotProdUint8x64", argLength: 2, commutative: false}, - {name: "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x4", argLength: 3, commutative: false}, - {name: 
"SaturatedUnsignedSignedQuadDotProdAccumulateInt32x8", argLength: 3, commutative: false}, - {name: "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x16", argLength: 3, commutative: false}, - {name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4", argLength: 4, commutative: false}, - {name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8", argLength: 4, commutative: false}, - {name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16", argLength: 4, commutative: false}, + {name: "RoundToEvenFloat32x4", argLength: 1, commutative: false}, + {name: "RoundToEvenFloat32x8", argLength: 1, commutative: false}, + {name: "RoundToEvenFloat64x2", argLength: 1, commutative: false}, + {name: "RoundToEvenFloat64x4", argLength: 1, commutative: false}, {name: "ScaleFloat32x4", argLength: 2, commutative: false}, {name: "ScaleFloat32x8", argLength: 2, commutative: false}, {name: "ScaleFloat32x16", argLength: 2, commutative: false}, @@ -1506,12 +1509,6 @@ func simdGenericOps() []opData { {name: "ShiftRightUint64x2", argLength: 2, commutative: false}, {name: "ShiftRightUint64x4", argLength: 2, commutative: false}, {name: "ShiftRightUint64x8", argLength: 2, commutative: false}, - {name: "SignInt8x16", argLength: 2, commutative: false}, - {name: "SignInt8x32", argLength: 2, commutative: false}, - {name: "SignInt16x8", argLength: 2, commutative: false}, - {name: "SignInt16x16", argLength: 2, commutative: false}, - {name: "SignInt32x4", argLength: 2, commutative: false}, - {name: "SignInt32x8", argLength: 2, commutative: false}, {name: "SqrtFloat32x4", argLength: 1, commutative: false}, {name: "SqrtFloat32x8", argLength: 1, commutative: false}, {name: "SqrtFloat32x16", argLength: 1, commutative: false}, @@ -1626,12 +1623,6 @@ func simdGenericOps() []opData { {name: "TruncFloat32x8", argLength: 1, commutative: false}, {name: "TruncFloat64x2", argLength: 1, commutative: false}, {name: "TruncFloat64x4", argLength: 1, commutative: false}, - {name: "UnsignedSignedQuadDotProdAccumulateInt32x4", argLength: 3, commutative: false}, - {name: "UnsignedSignedQuadDotProdAccumulateInt32x8", argLength: 3, commutative: false}, - {name: "UnsignedSignedQuadDotProdAccumulateInt32x16", argLength: 3, commutative: false}, - {name: "UnsignedSignedQuadDotProdAccumulateMaskedInt32x4", argLength: 4, commutative: false}, - {name: "UnsignedSignedQuadDotProdAccumulateMaskedInt32x8", argLength: 4, commutative: false}, - {name: "UnsignedSignedQuadDotProdAccumulateMaskedInt32x16", argLength: 4, commutative: false}, {name: "XorInt8x16", argLength: 2, commutative: true}, {name: "XorInt8x32", argLength: 2, commutative: true}, {name: "XorInt8x64", argLength: 2, commutative: true}, @@ -1790,30 +1781,30 @@ func simdGenericOps() []opData { {name: "RotateAllRightUint64x2", argLength: 1, commutative: false, aux: "Int8"}, {name: "RotateAllRightUint64x4", argLength: 1, commutative: false, aux: "Int8"}, {name: "RotateAllRightUint64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundScaledFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundScaledMaskedFloat32x4", argLength: 2, commutative: 
false, aux: "Int8"}, - {name: "RoundScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledResidueMaskedFloat64x2", 
argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundToEvenScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, {name: "SetElemInt8x16", argLength: 2, commutative: false, aux: "Int8"}, {name: "SetElemInt16x8", argLength: 2, commutative: false, aux: "Int8"}, {name: "SetElemInt32x4", argLength: 2, commutative: false, aux: "Int8"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 7c135ea692..8bf850d78e 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1418,6 +1418,18 @@ const ( OpAMD64VPADDSWMasked128 OpAMD64VPADDSWMasked256 OpAMD64VPADDSWMasked512 + OpAMD64VPADDUSB128 + OpAMD64VPADDUSB256 + OpAMD64VPADDUSB512 + OpAMD64VPADDUSBMasked128 + OpAMD64VPADDUSBMasked256 + OpAMD64VPADDUSBMasked512 + OpAMD64VPADDUSW128 + OpAMD64VPADDUSW256 + OpAMD64VPADDUSW512 + OpAMD64VPADDUSWMasked128 + OpAMD64VPADDUSWMasked256 + OpAMD64VPADDUSWMasked512 OpAMD64VPADDW128 OpAMD64VPADDW256 OpAMD64VPADDW512 @@ -1720,22 +1732,12 @@ const ( OpAMD64VPMINUWMasked512 OpAMD64VPMULDQ128 OpAMD64VPMULDQ256 - OpAMD64VPMULDQ512 - OpAMD64VPMULDQMasked128 - OpAMD64VPMULDQMasked256 - OpAMD64VPMULDQMasked512 OpAMD64VPMULHUW128 OpAMD64VPMULHUW256 - OpAMD64VPMULHUW512 OpAMD64VPMULHUWMasked128 - OpAMD64VPMULHUWMasked256 OpAMD64VPMULHUWMasked512 - OpAMD64VPMULHW128 - OpAMD64VPMULHW256 OpAMD64VPMULHW512 - OpAMD64VPMULHWMasked128 OpAMD64VPMULHWMasked256 - OpAMD64VPMULHWMasked512 OpAMD64VPMULLD128 OpAMD64VPMULLD256 OpAMD64VPMULLD512 @@ -1756,10 +1758,6 @@ const ( OpAMD64VPMULLWMasked512 OpAMD64VPMULUDQ128 OpAMD64VPMULUDQ256 - OpAMD64VPMULUDQ512 - OpAMD64VPMULUDQMasked128 - OpAMD64VPMULUDQMasked256 - OpAMD64VPMULUDQMasked512 OpAMD64VPOPCNTB128 OpAMD64VPOPCNTB256 OpAMD64VPOPCNTB512 @@ -1998,6 +1996,18 @@ const ( OpAMD64VPSUBSWMasked128 OpAMD64VPSUBSWMasked256 OpAMD64VPSUBSWMasked512 + OpAMD64VPSUBUSB128 + OpAMD64VPSUBUSB256 + OpAMD64VPSUBUSB512 + OpAMD64VPSUBUSBMasked128 + OpAMD64VPSUBUSBMasked256 + OpAMD64VPSUBUSBMasked512 + OpAMD64VPSUBUSW128 + OpAMD64VPSUBUSW256 + OpAMD64VPSUBUSW512 + OpAMD64VPSUBUSWMasked128 + OpAMD64VPSUBUSWMasked256 + OpAMD64VPSUBUSWMasked512 OpAMD64VPSUBW128 OpAMD64VPSUBW256 OpAMD64VPSUBW512 @@ -2102,9 +2112,6 @@ const ( OpAMD64VREDUCEPDMasked128 OpAMD64VREDUCEPDMasked256 OpAMD64VREDUCEPDMasked512 - OpAMD64VDPPS128 - OpAMD64VDPPS256 - OpAMD64VDPPD128 OpAMD64VCMPPS128 OpAMD64VCMPPS256 OpAMD64VCMPPS512 @@ -4598,36 +4605,48 @@ const ( OpCvtMask64x2to8 OpCvtMask64x4to8 OpCvtMask64x8to8 - OpAbsoluteInt8x16 - OpAbsoluteInt8x32 - OpAbsoluteInt8x64 - OpAbsoluteInt16x8 - OpAbsoluteInt16x16 - OpAbsoluteInt16x32 - OpAbsoluteInt32x4 - OpAbsoluteInt32x8 - OpAbsoluteInt32x16 - OpAbsoluteInt64x2 - OpAbsoluteInt64x4 - OpAbsoluteInt64x8 - OpAbsoluteMaskedInt8x16 - OpAbsoluteMaskedInt8x32 - OpAbsoluteMaskedInt8x64 - OpAbsoluteMaskedInt16x8 - OpAbsoluteMaskedInt16x16 - OpAbsoluteMaskedInt16x32 - OpAbsoluteMaskedInt32x4 - OpAbsoluteMaskedInt32x8 - OpAbsoluteMaskedInt32x16 - OpAbsoluteMaskedInt64x2 - OpAbsoluteMaskedInt64x4 - OpAbsoluteMaskedInt64x8 - OpAddDotProdInt32x4 - OpAddDotProdInt32x8 - OpAddDotProdInt32x16 - OpAddDotProdMaskedInt32x4 - OpAddDotProdMaskedInt32x8 - OpAddDotProdMaskedInt32x16 + OpAbsInt8x16 + OpAbsInt8x32 + OpAbsInt8x64 + OpAbsInt16x8 + OpAbsInt16x16 + OpAbsInt16x32 + OpAbsInt32x4 + OpAbsInt32x8 + OpAbsInt32x16 + OpAbsInt64x2 + OpAbsInt64x4 + OpAbsInt64x8 + OpAbsMaskedInt8x16 + 
OpAbsMaskedInt8x32 + OpAbsMaskedInt8x64 + OpAbsMaskedInt16x8 + OpAbsMaskedInt16x16 + OpAbsMaskedInt16x32 + OpAbsMaskedInt32x4 + OpAbsMaskedInt32x8 + OpAbsMaskedInt32x16 + OpAbsMaskedInt64x2 + OpAbsMaskedInt64x4 + OpAbsMaskedInt64x8 + OpAddDotProdPairsSaturatedInt32x4 + OpAddDotProdPairsSaturatedInt32x8 + OpAddDotProdPairsSaturatedInt32x16 + OpAddDotProdPairsSaturatedMaskedInt32x4 + OpAddDotProdPairsSaturatedMaskedInt32x8 + OpAddDotProdPairsSaturatedMaskedInt32x16 + OpAddDotProdQuadrupleInt32x4 + OpAddDotProdQuadrupleInt32x8 + OpAddDotProdQuadrupleInt32x16 + OpAddDotProdQuadrupleMaskedInt32x4 + OpAddDotProdQuadrupleMaskedInt32x8 + OpAddDotProdQuadrupleMaskedInt32x16 + OpAddDotProdQuadrupleSaturatedInt32x4 + OpAddDotProdQuadrupleSaturatedInt32x8 + OpAddDotProdQuadrupleSaturatedInt32x16 + OpAddDotProdQuadrupleSaturatedMaskedInt32x4 + OpAddDotProdQuadrupleSaturatedMaskedInt32x8 + OpAddDotProdQuadrupleSaturatedMaskedInt32x16 OpAddFloat32x4 OpAddFloat32x8 OpAddFloat32x16 @@ -4802,30 +4821,6 @@ const ( OpAndUint64x2 OpAndUint64x4 OpAndUint64x8 - OpApproximateReciprocalFloat32x4 - OpApproximateReciprocalFloat32x8 - OpApproximateReciprocalFloat32x16 - OpApproximateReciprocalFloat64x2 - OpApproximateReciprocalFloat64x4 - OpApproximateReciprocalFloat64x8 - OpApproximateReciprocalMaskedFloat32x4 - OpApproximateReciprocalMaskedFloat32x8 - OpApproximateReciprocalMaskedFloat32x16 - OpApproximateReciprocalMaskedFloat64x2 - OpApproximateReciprocalMaskedFloat64x4 - OpApproximateReciprocalMaskedFloat64x8 - OpApproximateReciprocalOfSqrtFloat32x4 - OpApproximateReciprocalOfSqrtFloat32x8 - OpApproximateReciprocalOfSqrtFloat32x16 - OpApproximateReciprocalOfSqrtFloat64x2 - OpApproximateReciprocalOfSqrtFloat64x4 - OpApproximateReciprocalOfSqrtFloat64x8 - OpApproximateReciprocalOfSqrtMaskedFloat32x4 - OpApproximateReciprocalOfSqrtMaskedFloat32x8 - OpApproximateReciprocalOfSqrtMaskedFloat32x16 - OpApproximateReciprocalOfSqrtMaskedFloat64x2 - OpApproximateReciprocalOfSqrtMaskedFloat64x4 - OpApproximateReciprocalOfSqrtMaskedFloat64x8 OpAverageMaskedUint8x16 OpAverageMaskedUint8x32 OpAverageMaskedUint8x64 @@ -4884,6 +4879,12 @@ const ( OpConvertToUint32MaskedFloat32x4 OpConvertToUint32MaskedFloat32x8 OpConvertToUint32MaskedFloat32x16 + OpCopySignInt8x16 + OpCopySignInt8x32 + OpCopySignInt16x8 + OpCopySignInt16x16 + OpCopySignInt32x4 + OpCopySignInt32x8 OpDivFloat32x4 OpDivFloat32x8 OpDivFloat32x16 @@ -4896,9 +4897,18 @@ const ( OpDivMaskedFloat64x2 OpDivMaskedFloat64x4 OpDivMaskedFloat64x8 - OpDotProdBroadcastFloat32x4 - OpDotProdBroadcastFloat32x8 - OpDotProdBroadcastFloat64x2 + OpDotProdPairsInt16x8 + OpDotProdPairsInt16x16 + OpDotProdPairsInt16x32 + OpDotProdPairsMaskedInt16x8 + OpDotProdPairsMaskedInt16x16 + OpDotProdPairsMaskedInt16x32 + OpDotProdPairsSaturatedMaskedUint8x16 + OpDotProdPairsSaturatedMaskedUint8x32 + OpDotProdPairsSaturatedMaskedUint8x64 + OpDotProdPairsSaturatedUint8x16 + OpDotProdPairsSaturatedUint8x32 + OpDotProdPairsSaturatedUint8x64 OpEqualFloat32x4 OpEqualFloat32x8 OpEqualFloat32x16 @@ -4993,42 +5003,6 @@ const ( OpFloorFloat32x8 OpFloorFloat64x2 OpFloorFloat64x4 - OpFusedMultiplyAddFloat32x4 - OpFusedMultiplyAddFloat32x8 - OpFusedMultiplyAddFloat32x16 - OpFusedMultiplyAddFloat64x2 - OpFusedMultiplyAddFloat64x4 - OpFusedMultiplyAddFloat64x8 - OpFusedMultiplyAddMaskedFloat32x4 - OpFusedMultiplyAddMaskedFloat32x8 - OpFusedMultiplyAddMaskedFloat32x16 - OpFusedMultiplyAddMaskedFloat64x2 - OpFusedMultiplyAddMaskedFloat64x4 - OpFusedMultiplyAddMaskedFloat64x8 - OpFusedMultiplyAddSubFloat32x4 - 
OpFusedMultiplyAddSubFloat32x8 - OpFusedMultiplyAddSubFloat32x16 - OpFusedMultiplyAddSubFloat64x2 - OpFusedMultiplyAddSubFloat64x4 - OpFusedMultiplyAddSubFloat64x8 - OpFusedMultiplyAddSubMaskedFloat32x4 - OpFusedMultiplyAddSubMaskedFloat32x8 - OpFusedMultiplyAddSubMaskedFloat32x16 - OpFusedMultiplyAddSubMaskedFloat64x2 - OpFusedMultiplyAddSubMaskedFloat64x4 - OpFusedMultiplyAddSubMaskedFloat64x8 - OpFusedMultiplySubAddFloat32x4 - OpFusedMultiplySubAddFloat32x8 - OpFusedMultiplySubAddFloat32x16 - OpFusedMultiplySubAddFloat64x2 - OpFusedMultiplySubAddFloat64x4 - OpFusedMultiplySubAddFloat64x8 - OpFusedMultiplySubAddMaskedFloat32x4 - OpFusedMultiplySubAddMaskedFloat32x8 - OpFusedMultiplySubAddMaskedFloat32x16 - OpFusedMultiplySubAddMaskedFloat64x2 - OpFusedMultiplySubAddMaskedFloat64x4 - OpFusedMultiplySubAddMaskedFloat64x8 OpGaloisFieldMulMaskedUint8x16 OpGaloisFieldMulMaskedUint8x32 OpGaloisFieldMulMaskedUint8x64 @@ -5447,22 +5421,34 @@ const ( OpMinUint64x2 OpMinUint64x4 OpMinUint64x8 + OpMulAddFloat32x4 + OpMulAddFloat32x8 + OpMulAddFloat32x16 + OpMulAddFloat64x2 + OpMulAddFloat64x4 + OpMulAddFloat64x8 + OpMulAddMaskedFloat32x4 + OpMulAddMaskedFloat32x8 + OpMulAddMaskedFloat32x16 + OpMulAddMaskedFloat64x2 + OpMulAddMaskedFloat64x4 + OpMulAddMaskedFloat64x8 + OpMulAddSubFloat32x4 + OpMulAddSubFloat32x8 + OpMulAddSubFloat32x16 + OpMulAddSubFloat64x2 + OpMulAddSubFloat64x4 + OpMulAddSubFloat64x8 + OpMulAddSubMaskedFloat32x4 + OpMulAddSubMaskedFloat32x8 + OpMulAddSubMaskedFloat32x16 + OpMulAddSubMaskedFloat64x2 + OpMulAddSubMaskedFloat64x4 + OpMulAddSubMaskedFloat64x8 OpMulEvenWidenInt32x4 OpMulEvenWidenInt32x8 - OpMulEvenWidenInt64x2 - OpMulEvenWidenInt64x4 - OpMulEvenWidenInt64x8 - OpMulEvenWidenMaskedInt64x2 - OpMulEvenWidenMaskedInt64x4 - OpMulEvenWidenMaskedInt64x8 - OpMulEvenWidenMaskedUint64x2 - OpMulEvenWidenMaskedUint64x4 - OpMulEvenWidenMaskedUint64x8 OpMulEvenWidenUint32x4 OpMulEvenWidenUint32x8 - OpMulEvenWidenUint64x2 - OpMulEvenWidenUint64x4 - OpMulEvenWidenUint64x8 OpMulFloat32x4 OpMulFloat32x8 OpMulFloat32x16 @@ -5475,12 +5461,6 @@ const ( OpMulHighMaskedInt16x8 OpMulHighMaskedInt16x16 OpMulHighMaskedInt16x32 - OpMulHighMaskedUint16x8 - OpMulHighMaskedUint16x16 - OpMulHighMaskedUint16x32 - OpMulHighUint16x8 - OpMulHighUint16x16 - OpMulHighUint16x32 OpMulInt16x8 OpMulInt16x16 OpMulInt16x32 @@ -5505,6 +5485,36 @@ const ( OpMulMaskedInt64x2 OpMulMaskedInt64x4 OpMulMaskedInt64x8 + OpMulMaskedUint16x8 + OpMulMaskedUint16x16 + OpMulMaskedUint16x32 + OpMulMaskedUint32x4 + OpMulMaskedUint32x8 + OpMulMaskedUint32x16 + OpMulMaskedUint64x2 + OpMulMaskedUint64x4 + OpMulMaskedUint64x8 + OpMulSubAddFloat32x4 + OpMulSubAddFloat32x8 + OpMulSubAddFloat32x16 + OpMulSubAddFloat64x2 + OpMulSubAddFloat64x4 + OpMulSubAddFloat64x8 + OpMulSubAddMaskedFloat32x4 + OpMulSubAddMaskedFloat32x8 + OpMulSubAddMaskedFloat32x16 + OpMulSubAddMaskedFloat64x2 + OpMulSubAddMaskedFloat64x4 + OpMulSubAddMaskedFloat64x8 + OpMulUint16x8 + OpMulUint16x16 + OpMulUint16x32 + OpMulUint32x4 + OpMulUint32x8 + OpMulUint32x16 + OpMulUint64x2 + OpMulUint64x4 + OpMulUint64x8 OpNotEqualFloat32x4 OpNotEqualFloat32x8 OpNotEqualFloat32x16 @@ -5565,6 +5575,54 @@ const ( OpNotEqualUint64x2 OpNotEqualUint64x4 OpNotEqualUint64x8 + OpOnesCountInt8x16 + OpOnesCountInt8x32 + OpOnesCountInt8x64 + OpOnesCountInt16x8 + OpOnesCountInt16x16 + OpOnesCountInt16x32 + OpOnesCountInt32x4 + OpOnesCountInt32x8 + OpOnesCountInt32x16 + OpOnesCountInt64x2 + OpOnesCountInt64x4 + OpOnesCountInt64x8 + OpOnesCountMaskedInt8x16 + OpOnesCountMaskedInt8x32 + 
OpOnesCountMaskedInt8x64 + OpOnesCountMaskedInt16x8 + OpOnesCountMaskedInt16x16 + OpOnesCountMaskedInt16x32 + OpOnesCountMaskedInt32x4 + OpOnesCountMaskedInt32x8 + OpOnesCountMaskedInt32x16 + OpOnesCountMaskedInt64x2 + OpOnesCountMaskedInt64x4 + OpOnesCountMaskedInt64x8 + OpOnesCountMaskedUint8x16 + OpOnesCountMaskedUint8x32 + OpOnesCountMaskedUint8x64 + OpOnesCountMaskedUint16x8 + OpOnesCountMaskedUint16x16 + OpOnesCountMaskedUint16x32 + OpOnesCountMaskedUint32x4 + OpOnesCountMaskedUint32x8 + OpOnesCountMaskedUint32x16 + OpOnesCountMaskedUint64x2 + OpOnesCountMaskedUint64x4 + OpOnesCountMaskedUint64x8 + OpOnesCountUint8x16 + OpOnesCountUint8x32 + OpOnesCountUint8x64 + OpOnesCountUint16x8 + OpOnesCountUint16x16 + OpOnesCountUint16x32 + OpOnesCountUint32x4 + OpOnesCountUint32x8 + OpOnesCountUint32x16 + OpOnesCountUint64x2 + OpOnesCountUint64x4 + OpOnesCountUint64x8 OpOrInt8x16 OpOrInt8x32 OpOrInt8x64 @@ -5601,12 +5659,6 @@ const ( OpOrUint64x2 OpOrUint64x4 OpOrUint64x8 - OpPairDotProdInt16x8 - OpPairDotProdInt16x16 - OpPairDotProdInt16x32 - OpPairDotProdMaskedInt16x8 - OpPairDotProdMaskedInt16x16 - OpPairDotProdMaskedInt16x32 OpPermute2Float32x4 OpPermute2Float32x8 OpPermute2Float32x16 @@ -5715,54 +5767,30 @@ const ( OpPermuteUint32x16 OpPermuteUint64x4 OpPermuteUint64x8 - OpPopCountInt8x16 - OpPopCountInt8x32 - OpPopCountInt8x64 - OpPopCountInt16x8 - OpPopCountInt16x16 - OpPopCountInt16x32 - OpPopCountInt32x4 - OpPopCountInt32x8 - OpPopCountInt32x16 - OpPopCountInt64x2 - OpPopCountInt64x4 - OpPopCountInt64x8 - OpPopCountMaskedInt8x16 - OpPopCountMaskedInt8x32 - OpPopCountMaskedInt8x64 - OpPopCountMaskedInt16x8 - OpPopCountMaskedInt16x16 - OpPopCountMaskedInt16x32 - OpPopCountMaskedInt32x4 - OpPopCountMaskedInt32x8 - OpPopCountMaskedInt32x16 - OpPopCountMaskedInt64x2 - OpPopCountMaskedInt64x4 - OpPopCountMaskedInt64x8 - OpPopCountMaskedUint8x16 - OpPopCountMaskedUint8x32 - OpPopCountMaskedUint8x64 - OpPopCountMaskedUint16x8 - OpPopCountMaskedUint16x16 - OpPopCountMaskedUint16x32 - OpPopCountMaskedUint32x4 - OpPopCountMaskedUint32x8 - OpPopCountMaskedUint32x16 - OpPopCountMaskedUint64x2 - OpPopCountMaskedUint64x4 - OpPopCountMaskedUint64x8 - OpPopCountUint8x16 - OpPopCountUint8x32 - OpPopCountUint8x64 - OpPopCountUint16x8 - OpPopCountUint16x16 - OpPopCountUint16x32 - OpPopCountUint32x4 - OpPopCountUint32x8 - OpPopCountUint32x16 - OpPopCountUint64x2 - OpPopCountUint64x4 - OpPopCountUint64x8 + OpReciprocalFloat32x4 + OpReciprocalFloat32x8 + OpReciprocalFloat32x16 + OpReciprocalFloat64x2 + OpReciprocalFloat64x4 + OpReciprocalFloat64x8 + OpReciprocalMaskedFloat32x4 + OpReciprocalMaskedFloat32x8 + OpReciprocalMaskedFloat32x16 + OpReciprocalMaskedFloat64x2 + OpReciprocalMaskedFloat64x4 + OpReciprocalMaskedFloat64x8 + OpReciprocalSqrtFloat32x4 + OpReciprocalSqrtFloat32x8 + OpReciprocalSqrtFloat32x16 + OpReciprocalSqrtFloat64x2 + OpReciprocalSqrtFloat64x4 + OpReciprocalSqrtFloat64x8 + OpReciprocalSqrtMaskedFloat32x4 + OpReciprocalSqrtMaskedFloat32x8 + OpReciprocalSqrtMaskedFloat32x16 + OpReciprocalSqrtMaskedFloat64x2 + OpReciprocalSqrtMaskedFloat64x4 + OpReciprocalSqrtMaskedFloat64x8 OpRotateLeftInt32x4 OpRotateLeftInt32x8 OpRotateLeftInt32x16 @@ -5811,28 +5839,10 @@ const ( OpRotateRightUint64x2 OpRotateRightUint64x4 OpRotateRightUint64x8 - OpRoundFloat32x4 - OpRoundFloat32x8 - OpRoundFloat64x2 - OpRoundFloat64x4 - OpSaturatedAddDotProdInt32x4 - OpSaturatedAddDotProdInt32x8 - OpSaturatedAddDotProdInt32x16 - OpSaturatedAddDotProdMaskedInt32x4 - OpSaturatedAddDotProdMaskedInt32x8 - 
OpSaturatedAddDotProdMaskedInt32x16 - OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16 - OpSaturatedUnsignedSignedPairDotProdMaskedUint8x32 - OpSaturatedUnsignedSignedPairDotProdMaskedUint8x64 - OpSaturatedUnsignedSignedPairDotProdUint8x16 - OpSaturatedUnsignedSignedPairDotProdUint8x32 - OpSaturatedUnsignedSignedPairDotProdUint8x64 - OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4 - OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8 - OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16 - OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4 - OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8 - OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16 + OpRoundToEvenFloat32x4 + OpRoundToEvenFloat32x8 + OpRoundToEvenFloat64x2 + OpRoundToEvenFloat64x4 OpScaleFloat32x4 OpScaleFloat32x8 OpScaleFloat32x16 @@ -6101,12 +6111,6 @@ const ( OpShiftRightUint64x2 OpShiftRightUint64x4 OpShiftRightUint64x8 - OpSignInt8x16 - OpSignInt8x32 - OpSignInt16x8 - OpSignInt16x16 - OpSignInt32x4 - OpSignInt32x8 OpSqrtFloat32x4 OpSqrtFloat32x8 OpSqrtFloat32x16 @@ -6221,12 +6225,6 @@ const ( OpTruncFloat32x8 OpTruncFloat64x2 OpTruncFloat64x4 - OpUnsignedSignedQuadDotProdAccumulateInt32x4 - OpUnsignedSignedQuadDotProdAccumulateInt32x8 - OpUnsignedSignedQuadDotProdAccumulateInt32x16 - OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x4 - OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x8 - OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x16 OpXorInt8x16 OpXorInt8x32 OpXorInt8x64 @@ -6385,30 +6383,30 @@ const ( OpRotateAllRightUint64x2 OpRotateAllRightUint64x4 OpRotateAllRightUint64x8 - OpRoundScaledFloat32x4 - OpRoundScaledFloat32x8 - OpRoundScaledFloat32x16 - OpRoundScaledFloat64x2 - OpRoundScaledFloat64x4 - OpRoundScaledFloat64x8 - OpRoundScaledMaskedFloat32x4 - OpRoundScaledMaskedFloat32x8 - OpRoundScaledMaskedFloat32x16 - OpRoundScaledMaskedFloat64x2 - OpRoundScaledMaskedFloat64x4 - OpRoundScaledMaskedFloat64x8 - OpRoundScaledResidueFloat32x4 - OpRoundScaledResidueFloat32x8 - OpRoundScaledResidueFloat32x16 - OpRoundScaledResidueFloat64x2 - OpRoundScaledResidueFloat64x4 - OpRoundScaledResidueFloat64x8 - OpRoundScaledResidueMaskedFloat32x4 - OpRoundScaledResidueMaskedFloat32x8 - OpRoundScaledResidueMaskedFloat32x16 - OpRoundScaledResidueMaskedFloat64x2 - OpRoundScaledResidueMaskedFloat64x4 - OpRoundScaledResidueMaskedFloat64x8 + OpRoundToEvenScaledFloat32x4 + OpRoundToEvenScaledFloat32x8 + OpRoundToEvenScaledFloat32x16 + OpRoundToEvenScaledFloat64x2 + OpRoundToEvenScaledFloat64x4 + OpRoundToEvenScaledFloat64x8 + OpRoundToEvenScaledMaskedFloat32x4 + OpRoundToEvenScaledMaskedFloat32x8 + OpRoundToEvenScaledMaskedFloat32x16 + OpRoundToEvenScaledMaskedFloat64x2 + OpRoundToEvenScaledMaskedFloat64x4 + OpRoundToEvenScaledMaskedFloat64x8 + OpRoundToEvenScaledResidueFloat32x4 + OpRoundToEvenScaledResidueFloat32x8 + OpRoundToEvenScaledResidueFloat32x16 + OpRoundToEvenScaledResidueFloat64x2 + OpRoundToEvenScaledResidueFloat64x4 + OpRoundToEvenScaledResidueFloat64x8 + OpRoundToEvenScaledResidueMaskedFloat32x4 + OpRoundToEvenScaledResidueMaskedFloat32x8 + OpRoundToEvenScaledResidueMaskedFloat32x16 + OpRoundToEvenScaledResidueMaskedFloat64x2 + OpRoundToEvenScaledResidueMaskedFloat64x4 + OpRoundToEvenScaledResidueMaskedFloat64x8 OpSetElemInt8x16 OpSetElemInt16x8 OpSetElemInt32x4 @@ -22405,6 +22403,192 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPADDUSB128", + argLen: 2, + commutative: true, + asm: x86.AVPADDUSB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 
X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPADDUSB256", + argLen: 2, + commutative: true, + asm: x86.AVPADDUSB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPADDUSB512", + argLen: 2, + commutative: true, + asm: x86.AVPADDUSB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPADDUSBMasked128", + argLen: 3, + commutative: true, + asm: x86.AVPADDUSB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPADDUSBMasked256", + argLen: 3, + commutative: true, + asm: x86.AVPADDUSB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPADDUSBMasked512", + argLen: 3, + commutative: true, + asm: x86.AVPADDUSB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPADDUSW128", + argLen: 2, + commutative: true, + asm: x86.AVPADDUSW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPADDUSW256", + argLen: 2, + commutative: true, + asm: x86.AVPADDUSW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPADDUSW512", + argLen: 2, + commutative: true, + asm: x86.AVPADDUSW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 
281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPADDUSWMasked128", + argLen: 3, + commutative: true, + asm: x86.AVPADDUSW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPADDUSWMasked256", + argLen: 3, + commutative: true, + asm: x86.AVPADDUSW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPADDUSWMasked512", + argLen: 3, + commutative: true, + asm: x86.AVPADDUSW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPADDW128", argLen: 2, @@ -27016,69 +27200,6 @@ var opcodeTable = [...]opInfo{ }, }, }, - { - name: "VPMULDQ512", - argLen: 2, - commutative: true, - asm: x86.AVPMULDQ, - reg: regInfo{ - inputs: []inputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPMULDQMasked128", - argLen: 3, - commutative: true, - asm: x86.AVPMULDQ, - reg: regInfo{ - inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, - { - name: "VPMULDQMasked256", - argLen: 3, - commutative: true, - asm: x86.AVPMULDQ, - reg: regInfo{ - inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, - { - name: "VPMULDQMasked512", - argLen: 3, - commutative: true, - asm: x86.AVPMULDQ, - reg: regInfo{ - inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - outputs: []outputInfo{ - {0, 
2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, { name: "VPMULHUW128", argLen: 2, @@ -27109,21 +27230,6 @@ var opcodeTable = [...]opInfo{ }, }, }, - { - name: "VPMULHUW512", - argLen: 2, - commutative: true, - asm: x86.AVPMULHUW, - reg: regInfo{ - inputs: []inputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, { name: "VPMULHUWMasked128", argLen: 3, @@ -27140,22 +27246,6 @@ var opcodeTable = [...]opInfo{ }, }, }, - { - name: "VPMULHUWMasked256", - argLen: 3, - commutative: true, - asm: x86.AVPMULHUW, - reg: regInfo{ - inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, { name: "VPMULHUWMasked512", argLen: 3, @@ -27172,36 +27262,6 @@ var opcodeTable = [...]opInfo{ }, }, }, - { - name: "VPMULHW128", - argLen: 2, - commutative: true, - asm: x86.AVPMULHW, - reg: regInfo{ - inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, - { - name: "VPMULHW256", - argLen: 2, - commutative: true, - asm: x86.AVPMULHW, - reg: regInfo{ - inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, { name: "VPMULHW512", argLen: 2, @@ -27217,22 +27277,6 @@ var opcodeTable = [...]opInfo{ }, }, }, - { - name: "VPMULHWMasked128", - argLen: 3, - commutative: true, - asm: x86.AVPMULHW, - reg: regInfo{ - inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, { name: "VPMULHWMasked256", argLen: 3, @@ -27249,22 +27293,6 @@ var opcodeTable = [...]opInfo{ }, }, }, - { - name: "VPMULHWMasked512", - argLen: 3, - commutative: true, - asm: x86.AVPMULHW, - reg: regInfo{ - inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, { name: "VPMULLD128", argLen: 2, @@ -27574,69 +27602,6 @@ var opcodeTable = [...]opInfo{ }, }, }, - { - name: "VPMULUDQ512", - argLen: 2, - commutative: true, - asm: x86.AVPMULUDQ, - reg: regInfo{ - inputs: []inputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 
X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPMULUDQMasked128", - argLen: 3, - commutative: true, - asm: x86.AVPMULUDQ, - reg: regInfo{ - inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, - { - name: "VPMULUDQMasked256", - argLen: 3, - commutative: true, - asm: x86.AVPMULUDQ, - reg: regInfo{ - inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, - { - name: "VPMULUDQMasked512", - argLen: 3, - commutative: true, - asm: x86.AVPMULUDQ, - reg: regInfo{ - inputs: []inputInfo{ - {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, { name: "VPOPCNTB128", argLen: 1, @@ -31144,6 +31109,180 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPSUBUSB128", + argLen: 2, + asm: x86.AVPSUBUSB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSUBUSB256", + argLen: 2, + asm: x86.AVPSUBUSB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSUBUSB512", + argLen: 2, + asm: x86.AVPSUBUSB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPSUBUSBMasked128", + argLen: 3, + asm: x86.AVPSUBUSB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSUBUSBMasked256", + 
argLen: 3, + asm: x86.AVPSUBUSB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSUBUSBMasked512", + argLen: 3, + asm: x86.AVPSUBUSB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSUBUSW128", + argLen: 2, + asm: x86.AVPSUBUSW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSUBUSW256", + argLen: 2, + asm: x86.AVPSUBUSW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSUBUSW512", + argLen: 2, + asm: x86.AVPSUBUSW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPSUBUSWMasked128", + argLen: 3, + asm: x86.AVPSUBUSW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSUBUSWMasked256", + argLen: 3, + asm: x86.AVPSUBUSW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSUBUSWMasked512", + argLen: 3, + asm: x86.AVPSUBUSW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPSUBW128", argLen: 2, @@ -32625,54 +32764,6 @@ var opcodeTable = [...]opInfo{ }, }, }, - { - name: "VDPPS128", - auxType: auxInt8, - argLen: 2, - commutative: true, - asm: x86.AVDPPS, - reg: 
regInfo{ - inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, - { - name: "VDPPS256", - auxType: auxInt8, - argLen: 2, - commutative: true, - asm: x86.AVDPPS, - reg: regInfo{ - inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, - { - name: "VDPPD128", - auxType: auxInt8, - argLen: 2, - commutative: true, - asm: x86.AVDPPD, - reg: regInfo{ - inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, { name: "VCMPPS128", auxType: auxInt8, @@ -63258,152 +63349,212 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "AbsoluteInt8x16", + name: "AbsInt8x16", argLen: 1, generic: true, }, { - name: "AbsoluteInt8x32", + name: "AbsInt8x32", argLen: 1, generic: true, }, { - name: "AbsoluteInt8x64", + name: "AbsInt8x64", argLen: 1, generic: true, }, { - name: "AbsoluteInt16x8", + name: "AbsInt16x8", argLen: 1, generic: true, }, { - name: "AbsoluteInt16x16", + name: "AbsInt16x16", argLen: 1, generic: true, }, { - name: "AbsoluteInt16x32", + name: "AbsInt16x32", argLen: 1, generic: true, }, { - name: "AbsoluteInt32x4", + name: "AbsInt32x4", argLen: 1, generic: true, }, { - name: "AbsoluteInt32x8", + name: "AbsInt32x8", argLen: 1, generic: true, }, { - name: "AbsoluteInt32x16", + name: "AbsInt32x16", argLen: 1, generic: true, }, { - name: "AbsoluteInt64x2", + name: "AbsInt64x2", argLen: 1, generic: true, }, { - name: "AbsoluteInt64x4", + name: "AbsInt64x4", argLen: 1, generic: true, }, { - name: "AbsoluteInt64x8", + name: "AbsInt64x8", argLen: 1, generic: true, }, { - name: "AbsoluteMaskedInt8x16", + name: "AbsMaskedInt8x16", argLen: 2, generic: true, }, { - name: "AbsoluteMaskedInt8x32", + name: "AbsMaskedInt8x32", argLen: 2, generic: true, }, { - name: "AbsoluteMaskedInt8x64", + name: "AbsMaskedInt8x64", argLen: 2, generic: true, }, { - name: "AbsoluteMaskedInt16x8", + name: "AbsMaskedInt16x8", argLen: 2, generic: true, }, { - name: "AbsoluteMaskedInt16x16", + name: "AbsMaskedInt16x16", argLen: 2, generic: true, }, { - name: "AbsoluteMaskedInt16x32", + name: "AbsMaskedInt16x32", argLen: 2, generic: true, }, { - name: "AbsoluteMaskedInt32x4", + name: "AbsMaskedInt32x4", argLen: 2, generic: true, }, { - name: "AbsoluteMaskedInt32x8", + name: "AbsMaskedInt32x8", argLen: 2, generic: true, }, { - name: "AbsoluteMaskedInt32x16", + name: "AbsMaskedInt32x16", argLen: 2, generic: true, }, { - name: "AbsoluteMaskedInt64x2", + name: "AbsMaskedInt64x2", argLen: 2, generic: true, }, { - name: "AbsoluteMaskedInt64x4", + name: "AbsMaskedInt64x4", argLen: 2, generic: true, }, { - name: "AbsoluteMaskedInt64x8", + name: "AbsMaskedInt64x8", argLen: 2, generic: true, }, { - name: "AddDotProdInt32x4", + name: "AddDotProdPairsSaturatedInt32x4", + argLen: 3, + generic: true, + }, + { + name: "AddDotProdPairsSaturatedInt32x8", + argLen: 3, + generic: true, + }, + { + name: "AddDotProdPairsSaturatedInt32x16", + argLen: 3, + generic: true, + }, + 
{ + name: "AddDotProdPairsSaturatedMaskedInt32x4", + argLen: 4, + generic: true, + }, + { + name: "AddDotProdPairsSaturatedMaskedInt32x8", + argLen: 4, + generic: true, + }, + { + name: "AddDotProdPairsSaturatedMaskedInt32x16", + argLen: 4, + generic: true, + }, + { + name: "AddDotProdQuadrupleInt32x4", + argLen: 3, + generic: true, + }, + { + name: "AddDotProdQuadrupleInt32x8", + argLen: 3, + generic: true, + }, + { + name: "AddDotProdQuadrupleInt32x16", + argLen: 3, + generic: true, + }, + { + name: "AddDotProdQuadrupleMaskedInt32x4", + argLen: 4, + generic: true, + }, + { + name: "AddDotProdQuadrupleMaskedInt32x8", + argLen: 4, + generic: true, + }, + { + name: "AddDotProdQuadrupleMaskedInt32x16", + argLen: 4, + generic: true, + }, + { + name: "AddDotProdQuadrupleSaturatedInt32x4", argLen: 3, generic: true, }, { - name: "AddDotProdInt32x8", + name: "AddDotProdQuadrupleSaturatedInt32x8", argLen: 3, generic: true, }, { - name: "AddDotProdInt32x16", + name: "AddDotProdQuadrupleSaturatedInt32x16", argLen: 3, generic: true, }, { - name: "AddDotProdMaskedInt32x4", + name: "AddDotProdQuadrupleSaturatedMaskedInt32x4", argLen: 4, generic: true, }, { - name: "AddDotProdMaskedInt32x8", + name: "AddDotProdQuadrupleSaturatedMaskedInt32x8", argLen: 4, generic: true, }, { - name: "AddDotProdMaskedInt32x16", + name: "AddDotProdQuadrupleSaturatedMaskedInt32x16", argLen: 4, generic: true, }, @@ -64397,126 +64548,6 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, - { - name: "ApproximateReciprocalFloat32x4", - argLen: 1, - generic: true, - }, - { - name: "ApproximateReciprocalFloat32x8", - argLen: 1, - generic: true, - }, - { - name: "ApproximateReciprocalFloat32x16", - argLen: 1, - generic: true, - }, - { - name: "ApproximateReciprocalFloat64x2", - argLen: 1, - generic: true, - }, - { - name: "ApproximateReciprocalFloat64x4", - argLen: 1, - generic: true, - }, - { - name: "ApproximateReciprocalFloat64x8", - argLen: 1, - generic: true, - }, - { - name: "ApproximateReciprocalMaskedFloat32x4", - argLen: 2, - generic: true, - }, - { - name: "ApproximateReciprocalMaskedFloat32x8", - argLen: 2, - generic: true, - }, - { - name: "ApproximateReciprocalMaskedFloat32x16", - argLen: 2, - generic: true, - }, - { - name: "ApproximateReciprocalMaskedFloat64x2", - argLen: 2, - generic: true, - }, - { - name: "ApproximateReciprocalMaskedFloat64x4", - argLen: 2, - generic: true, - }, - { - name: "ApproximateReciprocalMaskedFloat64x8", - argLen: 2, - generic: true, - }, - { - name: "ApproximateReciprocalOfSqrtFloat32x4", - argLen: 1, - generic: true, - }, - { - name: "ApproximateReciprocalOfSqrtFloat32x8", - argLen: 1, - generic: true, - }, - { - name: "ApproximateReciprocalOfSqrtFloat32x16", - argLen: 1, - generic: true, - }, - { - name: "ApproximateReciprocalOfSqrtFloat64x2", - argLen: 1, - generic: true, - }, - { - name: "ApproximateReciprocalOfSqrtFloat64x4", - argLen: 1, - generic: true, - }, - { - name: "ApproximateReciprocalOfSqrtFloat64x8", - argLen: 1, - generic: true, - }, - { - name: "ApproximateReciprocalOfSqrtMaskedFloat32x4", - argLen: 2, - generic: true, - }, - { - name: "ApproximateReciprocalOfSqrtMaskedFloat32x8", - argLen: 2, - generic: true, - }, - { - name: "ApproximateReciprocalOfSqrtMaskedFloat32x16", - argLen: 2, - generic: true, - }, - { - name: "ApproximateReciprocalOfSqrtMaskedFloat64x2", - argLen: 2, - generic: true, - }, - { - name: "ApproximateReciprocalOfSqrtMaskedFloat64x4", - argLen: 2, - generic: true, - }, - { - name: "ApproximateReciprocalOfSqrtMaskedFloat64x8", - 
argLen: 2, - generic: true, - }, { name: "AverageMaskedUint8x16", argLen: 3, @@ -64819,6 +64850,36 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "CopySignInt8x16", + argLen: 2, + generic: true, + }, + { + name: "CopySignInt8x32", + argLen: 2, + generic: true, + }, + { + name: "CopySignInt16x8", + argLen: 2, + generic: true, + }, + { + name: "CopySignInt16x16", + argLen: 2, + generic: true, + }, + { + name: "CopySignInt32x4", + argLen: 2, + generic: true, + }, + { + name: "CopySignInt32x8", + argLen: 2, + generic: true, + }, { name: "DivFloat32x4", argLen: 2, @@ -64880,22 +64941,64 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "DotProdBroadcastFloat32x4", - argLen: 2, - commutative: true, - generic: true, + name: "DotProdPairsInt16x8", + argLen: 2, + generic: true, }, { - name: "DotProdBroadcastFloat32x8", - argLen: 2, - commutative: true, - generic: true, + name: "DotProdPairsInt16x16", + argLen: 2, + generic: true, }, { - name: "DotProdBroadcastFloat64x2", - argLen: 2, - commutative: true, - generic: true, + name: "DotProdPairsInt16x32", + argLen: 2, + generic: true, + }, + { + name: "DotProdPairsMaskedInt16x8", + argLen: 3, + generic: true, + }, + { + name: "DotProdPairsMaskedInt16x16", + argLen: 3, + generic: true, + }, + { + name: "DotProdPairsMaskedInt16x32", + argLen: 3, + generic: true, + }, + { + name: "DotProdPairsSaturatedMaskedUint8x16", + argLen: 3, + generic: true, + }, + { + name: "DotProdPairsSaturatedMaskedUint8x32", + argLen: 3, + generic: true, + }, + { + name: "DotProdPairsSaturatedMaskedUint8x64", + argLen: 3, + generic: true, + }, + { + name: "DotProdPairsSaturatedUint8x16", + argLen: 2, + generic: true, + }, + { + name: "DotProdPairsSaturatedUint8x32", + argLen: 2, + generic: true, + }, + { + name: "DotProdPairsSaturatedUint8x64", + argLen: 2, + generic: true, }, { name: "EqualFloat32x4", @@ -65427,186 +65530,6 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, - { - name: "FusedMultiplyAddFloat32x4", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplyAddFloat32x8", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplyAddFloat32x16", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplyAddFloat64x2", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplyAddFloat64x4", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplyAddFloat64x8", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplyAddMaskedFloat32x4", - argLen: 4, - generic: true, - }, - { - name: "FusedMultiplyAddMaskedFloat32x8", - argLen: 4, - generic: true, - }, - { - name: "FusedMultiplyAddMaskedFloat32x16", - argLen: 4, - generic: true, - }, - { - name: "FusedMultiplyAddMaskedFloat64x2", - argLen: 4, - generic: true, - }, - { - name: "FusedMultiplyAddMaskedFloat64x4", - argLen: 4, - generic: true, - }, - { - name: "FusedMultiplyAddMaskedFloat64x8", - argLen: 4, - generic: true, - }, - { - name: "FusedMultiplyAddSubFloat32x4", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplyAddSubFloat32x8", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplyAddSubFloat32x16", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplyAddSubFloat64x2", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplyAddSubFloat64x4", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplyAddSubFloat64x8", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplyAddSubMaskedFloat32x4", - argLen: 4, - generic: true, - }, - { - name: "FusedMultiplyAddSubMaskedFloat32x8", - argLen: 4, - 
generic: true, - }, - { - name: "FusedMultiplyAddSubMaskedFloat32x16", - argLen: 4, - generic: true, - }, - { - name: "FusedMultiplyAddSubMaskedFloat64x2", - argLen: 4, - generic: true, - }, - { - name: "FusedMultiplyAddSubMaskedFloat64x4", - argLen: 4, - generic: true, - }, - { - name: "FusedMultiplyAddSubMaskedFloat64x8", - argLen: 4, - generic: true, - }, - { - name: "FusedMultiplySubAddFloat32x4", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplySubAddFloat32x8", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplySubAddFloat32x16", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplySubAddFloat64x2", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplySubAddFloat64x4", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplySubAddFloat64x8", - argLen: 3, - generic: true, - }, - { - name: "FusedMultiplySubAddMaskedFloat32x4", - argLen: 4, - generic: true, - }, - { - name: "FusedMultiplySubAddMaskedFloat32x8", - argLen: 4, - generic: true, - }, - { - name: "FusedMultiplySubAddMaskedFloat32x16", - argLen: 4, - generic: true, - }, - { - name: "FusedMultiplySubAddMaskedFloat64x2", - argLen: 4, - generic: true, - }, - { - name: "FusedMultiplySubAddMaskedFloat64x4", - argLen: 4, - generic: true, - }, - { - name: "FusedMultiplySubAddMaskedFloat64x8", - argLen: 4, - generic: true, - }, { name: "GaloisFieldMulMaskedUint8x16", argLen: 3, @@ -67829,6 +67752,126 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "MulAddFloat32x4", + argLen: 3, + generic: true, + }, + { + name: "MulAddFloat32x8", + argLen: 3, + generic: true, + }, + { + name: "MulAddFloat32x16", + argLen: 3, + generic: true, + }, + { + name: "MulAddFloat64x2", + argLen: 3, + generic: true, + }, + { + name: "MulAddFloat64x4", + argLen: 3, + generic: true, + }, + { + name: "MulAddFloat64x8", + argLen: 3, + generic: true, + }, + { + name: "MulAddMaskedFloat32x4", + argLen: 4, + generic: true, + }, + { + name: "MulAddMaskedFloat32x8", + argLen: 4, + generic: true, + }, + { + name: "MulAddMaskedFloat32x16", + argLen: 4, + generic: true, + }, + { + name: "MulAddMaskedFloat64x2", + argLen: 4, + generic: true, + }, + { + name: "MulAddMaskedFloat64x4", + argLen: 4, + generic: true, + }, + { + name: "MulAddMaskedFloat64x8", + argLen: 4, + generic: true, + }, + { + name: "MulAddSubFloat32x4", + argLen: 3, + generic: true, + }, + { + name: "MulAddSubFloat32x8", + argLen: 3, + generic: true, + }, + { + name: "MulAddSubFloat32x16", + argLen: 3, + generic: true, + }, + { + name: "MulAddSubFloat64x2", + argLen: 3, + generic: true, + }, + { + name: "MulAddSubFloat64x4", + argLen: 3, + generic: true, + }, + { + name: "MulAddSubFloat64x8", + argLen: 3, + generic: true, + }, + { + name: "MulAddSubMaskedFloat32x4", + argLen: 4, + generic: true, + }, + { + name: "MulAddSubMaskedFloat32x8", + argLen: 4, + generic: true, + }, + { + name: "MulAddSubMaskedFloat32x16", + argLen: 4, + generic: true, + }, + { + name: "MulAddSubMaskedFloat64x2", + argLen: 4, + generic: true, + }, + { + name: "MulAddSubMaskedFloat64x4", + argLen: 4, + generic: true, + }, + { + name: "MulAddSubMaskedFloat64x8", + argLen: 4, + generic: true, + }, { name: "MulEvenWidenInt32x4", argLen: 2, @@ -67842,338 +67885,398 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "MulEvenWidenInt64x2", + name: "MulEvenWidenUint32x4", argLen: 2, commutative: true, generic: true, }, { - name: "MulEvenWidenInt64x4", + name: "MulEvenWidenUint32x8", argLen: 2, commutative: true, generic: true, }, { - name: 
"MulEvenWidenInt64x8", + name: "MulFloat32x4", argLen: 2, commutative: true, generic: true, }, { - name: "MulEvenWidenMaskedInt64x2", - argLen: 3, + name: "MulFloat32x8", + argLen: 2, commutative: true, generic: true, }, { - name: "MulEvenWidenMaskedInt64x4", - argLen: 3, + name: "MulFloat32x16", + argLen: 2, commutative: true, generic: true, }, { - name: "MulEvenWidenMaskedInt64x8", - argLen: 3, + name: "MulFloat64x2", + argLen: 2, commutative: true, generic: true, }, { - name: "MulEvenWidenMaskedUint64x2", - argLen: 3, + name: "MulFloat64x4", + argLen: 2, commutative: true, generic: true, }, { - name: "MulEvenWidenMaskedUint64x4", - argLen: 3, + name: "MulFloat64x8", + argLen: 2, commutative: true, generic: true, }, { - name: "MulEvenWidenMaskedUint64x8", - argLen: 3, + name: "MulHighInt16x8", + argLen: 2, commutative: true, generic: true, }, { - name: "MulEvenWidenUint32x4", + name: "MulHighInt16x16", argLen: 2, commutative: true, generic: true, }, { - name: "MulEvenWidenUint32x8", + name: "MulHighInt16x32", argLen: 2, commutative: true, generic: true, }, { - name: "MulEvenWidenUint64x2", - argLen: 2, + name: "MulHighMaskedInt16x8", + argLen: 3, commutative: true, generic: true, }, { - name: "MulEvenWidenUint64x4", - argLen: 2, + name: "MulHighMaskedInt16x16", + argLen: 3, commutative: true, generic: true, }, { - name: "MulEvenWidenUint64x8", - argLen: 2, + name: "MulHighMaskedInt16x32", + argLen: 3, commutative: true, generic: true, }, { - name: "MulFloat32x4", + name: "MulInt16x8", argLen: 2, commutative: true, generic: true, }, { - name: "MulFloat32x8", + name: "MulInt16x16", argLen: 2, commutative: true, generic: true, }, { - name: "MulFloat32x16", + name: "MulInt16x32", argLen: 2, commutative: true, generic: true, }, { - name: "MulFloat64x2", + name: "MulInt32x4", argLen: 2, commutative: true, generic: true, }, { - name: "MulFloat64x4", + name: "MulInt32x8", argLen: 2, commutative: true, generic: true, }, { - name: "MulFloat64x8", + name: "MulInt32x16", argLen: 2, commutative: true, generic: true, }, { - name: "MulHighInt16x8", + name: "MulInt64x2", argLen: 2, commutative: true, generic: true, }, { - name: "MulHighInt16x16", + name: "MulInt64x4", argLen: 2, commutative: true, generic: true, }, { - name: "MulHighInt16x32", + name: "MulInt64x8", argLen: 2, commutative: true, generic: true, }, { - name: "MulHighMaskedInt16x8", + name: "MulMaskedFloat32x4", argLen: 3, commutative: true, generic: true, }, { - name: "MulHighMaskedInt16x16", + name: "MulMaskedFloat32x8", argLen: 3, commutative: true, generic: true, }, { - name: "MulHighMaskedInt16x32", + name: "MulMaskedFloat32x16", argLen: 3, commutative: true, generic: true, }, { - name: "MulHighMaskedUint16x8", + name: "MulMaskedFloat64x2", argLen: 3, commutative: true, generic: true, }, { - name: "MulHighMaskedUint16x16", + name: "MulMaskedFloat64x4", argLen: 3, commutative: true, generic: true, }, { - name: "MulHighMaskedUint16x32", + name: "MulMaskedFloat64x8", argLen: 3, commutative: true, generic: true, }, { - name: "MulHighUint16x8", - argLen: 2, + name: "MulMaskedInt16x8", + argLen: 3, commutative: true, generic: true, }, { - name: "MulHighUint16x16", - argLen: 2, + name: "MulMaskedInt16x16", + argLen: 3, commutative: true, generic: true, }, { - name: "MulHighUint16x32", - argLen: 2, + name: "MulMaskedInt16x32", + argLen: 3, commutative: true, generic: true, }, { - name: "MulInt16x8", - argLen: 2, + name: "MulMaskedInt32x4", + argLen: 3, commutative: true, generic: true, }, { - name: "MulInt16x16", - argLen: 2, + name: 
"MulMaskedInt32x8", + argLen: 3, commutative: true, generic: true, }, { - name: "MulInt16x32", - argLen: 2, + name: "MulMaskedInt32x16", + argLen: 3, commutative: true, generic: true, }, { - name: "MulInt32x4", - argLen: 2, + name: "MulMaskedInt64x2", + argLen: 3, commutative: true, generic: true, }, { - name: "MulInt32x8", - argLen: 2, + name: "MulMaskedInt64x4", + argLen: 3, commutative: true, generic: true, }, { - name: "MulInt32x16", - argLen: 2, + name: "MulMaskedInt64x8", + argLen: 3, commutative: true, generic: true, }, { - name: "MulInt64x2", - argLen: 2, + name: "MulMaskedUint16x8", + argLen: 3, commutative: true, generic: true, }, { - name: "MulInt64x4", - argLen: 2, + name: "MulMaskedUint16x16", + argLen: 3, commutative: true, generic: true, }, { - name: "MulInt64x8", - argLen: 2, + name: "MulMaskedUint16x32", + argLen: 3, commutative: true, generic: true, }, { - name: "MulMaskedFloat32x4", + name: "MulMaskedUint32x4", argLen: 3, commutative: true, generic: true, }, { - name: "MulMaskedFloat32x8", + name: "MulMaskedUint32x8", argLen: 3, commutative: true, generic: true, }, { - name: "MulMaskedFloat32x16", + name: "MulMaskedUint32x16", argLen: 3, commutative: true, generic: true, }, { - name: "MulMaskedFloat64x2", + name: "MulMaskedUint64x2", argLen: 3, commutative: true, generic: true, }, { - name: "MulMaskedFloat64x4", + name: "MulMaskedUint64x4", argLen: 3, commutative: true, generic: true, }, { - name: "MulMaskedFloat64x8", + name: "MulMaskedUint64x8", argLen: 3, commutative: true, generic: true, }, { - name: "MulMaskedInt16x8", - argLen: 3, + name: "MulSubAddFloat32x4", + argLen: 3, + generic: true, + }, + { + name: "MulSubAddFloat32x8", + argLen: 3, + generic: true, + }, + { + name: "MulSubAddFloat32x16", + argLen: 3, + generic: true, + }, + { + name: "MulSubAddFloat64x2", + argLen: 3, + generic: true, + }, + { + name: "MulSubAddFloat64x4", + argLen: 3, + generic: true, + }, + { + name: "MulSubAddFloat64x8", + argLen: 3, + generic: true, + }, + { + name: "MulSubAddMaskedFloat32x4", + argLen: 4, + generic: true, + }, + { + name: "MulSubAddMaskedFloat32x8", + argLen: 4, + generic: true, + }, + { + name: "MulSubAddMaskedFloat32x16", + argLen: 4, + generic: true, + }, + { + name: "MulSubAddMaskedFloat64x2", + argLen: 4, + generic: true, + }, + { + name: "MulSubAddMaskedFloat64x4", + argLen: 4, + generic: true, + }, + { + name: "MulSubAddMaskedFloat64x8", + argLen: 4, + generic: true, + }, + { + name: "MulUint16x8", + argLen: 2, commutative: true, generic: true, }, { - name: "MulMaskedInt16x16", - argLen: 3, + name: "MulUint16x16", + argLen: 2, commutative: true, generic: true, }, { - name: "MulMaskedInt16x32", - argLen: 3, + name: "MulUint16x32", + argLen: 2, commutative: true, generic: true, }, { - name: "MulMaskedInt32x4", - argLen: 3, + name: "MulUint32x4", + argLen: 2, commutative: true, generic: true, }, { - name: "MulMaskedInt32x8", - argLen: 3, + name: "MulUint32x8", + argLen: 2, commutative: true, generic: true, }, { - name: "MulMaskedInt32x16", - argLen: 3, + name: "MulUint32x16", + argLen: 2, commutative: true, generic: true, }, { - name: "MulMaskedInt64x2", - argLen: 3, + name: "MulUint64x2", + argLen: 2, commutative: true, generic: true, }, { - name: "MulMaskedInt64x4", - argLen: 3, + name: "MulUint64x4", + argLen: 2, commutative: true, generic: true, }, { - name: "MulMaskedInt64x8", - argLen: 3, + name: "MulUint64x8", + argLen: 2, commutative: true, generic: true, }, @@ -68537,6 +68640,246 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + 
name: "OnesCountInt8x16", + argLen: 1, + generic: true, + }, + { + name: "OnesCountInt8x32", + argLen: 1, + generic: true, + }, + { + name: "OnesCountInt8x64", + argLen: 1, + generic: true, + }, + { + name: "OnesCountInt16x8", + argLen: 1, + generic: true, + }, + { + name: "OnesCountInt16x16", + argLen: 1, + generic: true, + }, + { + name: "OnesCountInt16x32", + argLen: 1, + generic: true, + }, + { + name: "OnesCountInt32x4", + argLen: 1, + generic: true, + }, + { + name: "OnesCountInt32x8", + argLen: 1, + generic: true, + }, + { + name: "OnesCountInt32x16", + argLen: 1, + generic: true, + }, + { + name: "OnesCountInt64x2", + argLen: 1, + generic: true, + }, + { + name: "OnesCountInt64x4", + argLen: 1, + generic: true, + }, + { + name: "OnesCountInt64x8", + argLen: 1, + generic: true, + }, + { + name: "OnesCountMaskedInt8x16", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedInt8x32", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedInt8x64", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedInt16x8", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedInt16x16", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedInt16x32", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedInt32x4", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedInt32x8", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedInt32x16", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedInt64x2", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedInt64x4", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedInt64x8", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedUint8x16", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedUint8x32", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedUint8x64", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedUint16x8", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedUint16x16", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedUint16x32", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedUint32x4", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedUint32x8", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedUint32x16", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedUint64x2", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedUint64x4", + argLen: 2, + generic: true, + }, + { + name: "OnesCountMaskedUint64x8", + argLen: 2, + generic: true, + }, + { + name: "OnesCountUint8x16", + argLen: 1, + generic: true, + }, + { + name: "OnesCountUint8x32", + argLen: 1, + generic: true, + }, + { + name: "OnesCountUint8x64", + argLen: 1, + generic: true, + }, + { + name: "OnesCountUint16x8", + argLen: 1, + generic: true, + }, + { + name: "OnesCountUint16x16", + argLen: 1, + generic: true, + }, + { + name: "OnesCountUint16x32", + argLen: 1, + generic: true, + }, + { + name: "OnesCountUint32x4", + argLen: 1, + generic: true, + }, + { + name: "OnesCountUint32x8", + argLen: 1, + generic: true, + }, + { + name: "OnesCountUint32x16", + argLen: 1, + generic: true, + }, + { + name: "OnesCountUint64x2", + argLen: 1, + generic: true, + }, + { + name: "OnesCountUint64x4", + argLen: 1, + generic: true, + }, + { + name: "OnesCountUint64x8", + argLen: 1, + generic: true, + }, { name: "OrInt8x16", argLen: 2, @@ -68753,36 +69096,6 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, - { - name: "PairDotProdInt16x8", - 
argLen: 2, - generic: true, - }, - { - name: "PairDotProdInt16x16", - argLen: 2, - generic: true, - }, - { - name: "PairDotProdInt16x32", - argLen: 2, - generic: true, - }, - { - name: "PairDotProdMaskedInt16x8", - argLen: 3, - generic: true, - }, - { - name: "PairDotProdMaskedInt16x16", - argLen: 3, - generic: true, - }, - { - name: "PairDotProdMaskedInt16x32", - argLen: 3, - generic: true, - }, { name: "Permute2Float32x4", argLen: 3, @@ -69324,243 +69637,123 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "PopCountInt8x16", - argLen: 1, - generic: true, - }, - { - name: "PopCountInt8x32", - argLen: 1, - generic: true, - }, - { - name: "PopCountInt8x64", - argLen: 1, - generic: true, - }, - { - name: "PopCountInt16x8", - argLen: 1, - generic: true, - }, - { - name: "PopCountInt16x16", - argLen: 1, - generic: true, - }, - { - name: "PopCountInt16x32", + name: "ReciprocalFloat32x4", argLen: 1, generic: true, }, { - name: "PopCountInt32x4", + name: "ReciprocalFloat32x8", argLen: 1, generic: true, }, { - name: "PopCountInt32x8", + name: "ReciprocalFloat32x16", argLen: 1, generic: true, }, { - name: "PopCountInt32x16", + name: "ReciprocalFloat64x2", argLen: 1, generic: true, }, { - name: "PopCountInt64x2", + name: "ReciprocalFloat64x4", argLen: 1, generic: true, }, { - name: "PopCountInt64x4", + name: "ReciprocalFloat64x8", argLen: 1, generic: true, }, { - name: "PopCountInt64x8", - argLen: 1, - generic: true, - }, - { - name: "PopCountMaskedInt8x16", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedInt8x32", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedInt8x64", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedInt16x8", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedInt16x16", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedInt16x32", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedInt32x4", + name: "ReciprocalMaskedFloat32x4", argLen: 2, generic: true, }, { - name: "PopCountMaskedInt32x8", + name: "ReciprocalMaskedFloat32x8", argLen: 2, generic: true, }, { - name: "PopCountMaskedInt32x16", + name: "ReciprocalMaskedFloat32x16", argLen: 2, generic: true, }, { - name: "PopCountMaskedInt64x2", + name: "ReciprocalMaskedFloat64x2", argLen: 2, generic: true, }, { - name: "PopCountMaskedInt64x4", + name: "ReciprocalMaskedFloat64x4", argLen: 2, generic: true, }, { - name: "PopCountMaskedInt64x8", + name: "ReciprocalMaskedFloat64x8", argLen: 2, generic: true, }, { - name: "PopCountMaskedUint8x16", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedUint8x32", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedUint8x64", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedUint16x8", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedUint16x16", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedUint16x32", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedUint32x4", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedUint32x8", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedUint32x16", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedUint64x2", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedUint64x4", - argLen: 2, - generic: true, - }, - { - name: "PopCountMaskedUint64x8", - argLen: 2, - generic: true, - }, - { - name: "PopCountUint8x16", + name: "ReciprocalSqrtFloat32x4", argLen: 1, generic: true, }, { - name: "PopCountUint8x32", + name: "ReciprocalSqrtFloat32x8", argLen: 1, generic: 
true, }, { - name: "PopCountUint8x64", + name: "ReciprocalSqrtFloat32x16", argLen: 1, generic: true, }, { - name: "PopCountUint16x8", + name: "ReciprocalSqrtFloat64x2", argLen: 1, generic: true, }, { - name: "PopCountUint16x16", + name: "ReciprocalSqrtFloat64x4", argLen: 1, generic: true, }, { - name: "PopCountUint16x32", + name: "ReciprocalSqrtFloat64x8", argLen: 1, generic: true, }, { - name: "PopCountUint32x4", - argLen: 1, + name: "ReciprocalSqrtMaskedFloat32x4", + argLen: 2, generic: true, }, { - name: "PopCountUint32x8", - argLen: 1, + name: "ReciprocalSqrtMaskedFloat32x8", + argLen: 2, generic: true, }, { - name: "PopCountUint32x16", - argLen: 1, + name: "ReciprocalSqrtMaskedFloat32x16", + argLen: 2, generic: true, }, { - name: "PopCountUint64x2", - argLen: 1, + name: "ReciprocalSqrtMaskedFloat64x2", + argLen: 2, generic: true, }, { - name: "PopCountUint64x4", - argLen: 1, + name: "ReciprocalSqrtMaskedFloat64x4", + argLen: 2, generic: true, }, { - name: "PopCountUint64x8", - argLen: 1, + name: "ReciprocalSqrtMaskedFloat64x8", + argLen: 2, generic: true, }, { @@ -69804,115 +69997,25 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "RoundFloat32x4", + name: "RoundToEvenFloat32x4", argLen: 1, generic: true, }, { - name: "RoundFloat32x8", + name: "RoundToEvenFloat32x8", argLen: 1, generic: true, }, { - name: "RoundFloat64x2", + name: "RoundToEvenFloat64x2", argLen: 1, generic: true, }, { - name: "RoundFloat64x4", + name: "RoundToEvenFloat64x4", argLen: 1, generic: true, }, - { - name: "SaturatedAddDotProdInt32x4", - argLen: 3, - generic: true, - }, - { - name: "SaturatedAddDotProdInt32x8", - argLen: 3, - generic: true, - }, - { - name: "SaturatedAddDotProdInt32x16", - argLen: 3, - generic: true, - }, - { - name: "SaturatedAddDotProdMaskedInt32x4", - argLen: 4, - generic: true, - }, - { - name: "SaturatedAddDotProdMaskedInt32x8", - argLen: 4, - generic: true, - }, - { - name: "SaturatedAddDotProdMaskedInt32x16", - argLen: 4, - generic: true, - }, - { - name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x16", - argLen: 3, - generic: true, - }, - { - name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x32", - argLen: 3, - generic: true, - }, - { - name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x64", - argLen: 3, - generic: true, - }, - { - name: "SaturatedUnsignedSignedPairDotProdUint8x16", - argLen: 2, - generic: true, - }, - { - name: "SaturatedUnsignedSignedPairDotProdUint8x32", - argLen: 2, - generic: true, - }, - { - name: "SaturatedUnsignedSignedPairDotProdUint8x64", - argLen: 2, - generic: true, - }, - { - name: "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x4", - argLen: 3, - generic: true, - }, - { - name: "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x8", - argLen: 3, - generic: true, - }, - { - name: "SaturatedUnsignedSignedQuadDotProdAccumulateInt32x16", - argLen: 3, - generic: true, - }, - { - name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4", - argLen: 4, - generic: true, - }, - { - name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8", - argLen: 4, - generic: true, - }, - { - name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16", - argLen: 4, - generic: true, - }, { name: "ScaleFloat32x4", argLen: 2, @@ -71253,36 +71356,6 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, - { - name: "SignInt8x16", - argLen: 2, - generic: true, - }, - { - name: "SignInt8x32", - argLen: 2, - generic: true, - }, - { - name: "SignInt16x8", - argLen: 2, - generic: true, - }, - { - name: "SignInt16x16", - argLen: 2, - 
generic: true, - }, - { - name: "SignInt32x4", - argLen: 2, - generic: true, - }, - { - name: "SignInt32x8", - argLen: 2, - generic: true, - }, { name: "SqrtFloat32x4", argLen: 1, @@ -71853,36 +71926,6 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, - { - name: "UnsignedSignedQuadDotProdAccumulateInt32x4", - argLen: 3, - generic: true, - }, - { - name: "UnsignedSignedQuadDotProdAccumulateInt32x8", - argLen: 3, - generic: true, - }, - { - name: "UnsignedSignedQuadDotProdAccumulateInt32x16", - argLen: 3, - generic: true, - }, - { - name: "UnsignedSignedQuadDotProdAccumulateMaskedInt32x4", - argLen: 4, - generic: true, - }, - { - name: "UnsignedSignedQuadDotProdAccumulateMaskedInt32x8", - argLen: 4, - generic: true, - }, - { - name: "UnsignedSignedQuadDotProdAccumulateMaskedInt32x16", - argLen: 4, - generic: true, - }, { name: "XorInt8x16", argLen: 2, @@ -72826,145 +72869,145 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "RoundScaledFloat32x4", + name: "RoundToEvenScaledFloat32x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundScaledFloat32x8", + name: "RoundToEvenScaledFloat32x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundScaledFloat32x16", + name: "RoundToEvenScaledFloat32x16", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundScaledFloat64x2", + name: "RoundToEvenScaledFloat64x2", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundScaledFloat64x4", + name: "RoundToEvenScaledFloat64x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundScaledFloat64x8", + name: "RoundToEvenScaledFloat64x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundScaledMaskedFloat32x4", + name: "RoundToEvenScaledMaskedFloat32x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundScaledMaskedFloat32x8", + name: "RoundToEvenScaledMaskedFloat32x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundScaledMaskedFloat32x16", + name: "RoundToEvenScaledMaskedFloat32x16", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundScaledMaskedFloat64x2", + name: "RoundToEvenScaledMaskedFloat64x2", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundScaledMaskedFloat64x4", + name: "RoundToEvenScaledMaskedFloat64x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundScaledMaskedFloat64x8", + name: "RoundToEvenScaledMaskedFloat64x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundScaledResidueFloat32x4", + name: "RoundToEvenScaledResidueFloat32x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundScaledResidueFloat32x8", + name: "RoundToEvenScaledResidueFloat32x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundScaledResidueFloat32x16", + name: "RoundToEvenScaledResidueFloat32x16", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundScaledResidueFloat64x2", + name: "RoundToEvenScaledResidueFloat64x2", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundScaledResidueFloat64x4", + name: "RoundToEvenScaledResidueFloat64x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundScaledResidueFloat64x8", + name: "RoundToEvenScaledResidueFloat64x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundScaledResidueMaskedFloat32x4", + name: "RoundToEvenScaledResidueMaskedFloat32x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundScaledResidueMaskedFloat32x8", + name: "RoundToEvenScaledResidueMaskedFloat32x8", auxType: auxInt8, argLen: 2, generic: true, }, { - 
name: "RoundScaledResidueMaskedFloat32x16", + name: "RoundToEvenScaledResidueMaskedFloat32x16", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundScaledResidueMaskedFloat64x2", + name: "RoundToEvenScaledResidueMaskedFloat64x2", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundScaledResidueMaskedFloat64x4", + name: "RoundToEvenScaledResidueMaskedFloat64x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundScaledResidueMaskedFloat64x8", + name: "RoundToEvenScaledResidueMaskedFloat64x8", auxType: auxInt8, argLen: 2, generic: true, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index eacb30768f..20d014361e 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -559,66 +559,66 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpAMD64XORQload(v) case OpAMD64XORQmodify: return rewriteValueAMD64_OpAMD64XORQmodify(v) - case OpAbsoluteInt16x16: + case OpAbsInt16x16: v.Op = OpAMD64VPABSW256 return true - case OpAbsoluteInt16x32: + case OpAbsInt16x32: v.Op = OpAMD64VPABSW512 return true - case OpAbsoluteInt16x8: + case OpAbsInt16x8: v.Op = OpAMD64VPABSW128 return true - case OpAbsoluteInt32x16: + case OpAbsInt32x16: v.Op = OpAMD64VPABSD512 return true - case OpAbsoluteInt32x4: + case OpAbsInt32x4: v.Op = OpAMD64VPABSD128 return true - case OpAbsoluteInt32x8: + case OpAbsInt32x8: v.Op = OpAMD64VPABSD256 return true - case OpAbsoluteInt64x2: + case OpAbsInt64x2: v.Op = OpAMD64VPABSQ128 return true - case OpAbsoluteInt64x4: + case OpAbsInt64x4: v.Op = OpAMD64VPABSQ256 return true - case OpAbsoluteInt64x8: + case OpAbsInt64x8: v.Op = OpAMD64VPABSQ512 return true - case OpAbsoluteInt8x16: + case OpAbsInt8x16: v.Op = OpAMD64VPABSB128 return true - case OpAbsoluteInt8x32: + case OpAbsInt8x32: v.Op = OpAMD64VPABSB256 return true - case OpAbsoluteInt8x64: + case OpAbsInt8x64: v.Op = OpAMD64VPABSB512 return true - case OpAbsoluteMaskedInt16x16: - return rewriteValueAMD64_OpAbsoluteMaskedInt16x16(v) - case OpAbsoluteMaskedInt16x32: - return rewriteValueAMD64_OpAbsoluteMaskedInt16x32(v) - case OpAbsoluteMaskedInt16x8: - return rewriteValueAMD64_OpAbsoluteMaskedInt16x8(v) - case OpAbsoluteMaskedInt32x16: - return rewriteValueAMD64_OpAbsoluteMaskedInt32x16(v) - case OpAbsoluteMaskedInt32x4: - return rewriteValueAMD64_OpAbsoluteMaskedInt32x4(v) - case OpAbsoluteMaskedInt32x8: - return rewriteValueAMD64_OpAbsoluteMaskedInt32x8(v) - case OpAbsoluteMaskedInt64x2: - return rewriteValueAMD64_OpAbsoluteMaskedInt64x2(v) - case OpAbsoluteMaskedInt64x4: - return rewriteValueAMD64_OpAbsoluteMaskedInt64x4(v) - case OpAbsoluteMaskedInt64x8: - return rewriteValueAMD64_OpAbsoluteMaskedInt64x8(v) - case OpAbsoluteMaskedInt8x16: - return rewriteValueAMD64_OpAbsoluteMaskedInt8x16(v) - case OpAbsoluteMaskedInt8x32: - return rewriteValueAMD64_OpAbsoluteMaskedInt8x32(v) - case OpAbsoluteMaskedInt8x64: - return rewriteValueAMD64_OpAbsoluteMaskedInt8x64(v) + case OpAbsMaskedInt16x16: + return rewriteValueAMD64_OpAbsMaskedInt16x16(v) + case OpAbsMaskedInt16x32: + return rewriteValueAMD64_OpAbsMaskedInt16x32(v) + case OpAbsMaskedInt16x8: + return rewriteValueAMD64_OpAbsMaskedInt16x8(v) + case OpAbsMaskedInt32x16: + return rewriteValueAMD64_OpAbsMaskedInt32x16(v) + case OpAbsMaskedInt32x4: + return rewriteValueAMD64_OpAbsMaskedInt32x4(v) + case OpAbsMaskedInt32x8: + return rewriteValueAMD64_OpAbsMaskedInt32x8(v) + case OpAbsMaskedInt64x2: + return 
rewriteValueAMD64_OpAbsMaskedInt64x2(v) + case OpAbsMaskedInt64x4: + return rewriteValueAMD64_OpAbsMaskedInt64x4(v) + case OpAbsMaskedInt64x8: + return rewriteValueAMD64_OpAbsMaskedInt64x8(v) + case OpAbsMaskedInt8x16: + return rewriteValueAMD64_OpAbsMaskedInt8x16(v) + case OpAbsMaskedInt8x32: + return rewriteValueAMD64_OpAbsMaskedInt8x32(v) + case OpAbsMaskedInt8x64: + return rewriteValueAMD64_OpAbsMaskedInt8x64(v) case OpAdd16: v.Op = OpAMD64ADDL return true @@ -637,21 +637,51 @@ func rewriteValueAMD64(v *Value) bool { case OpAdd8: v.Op = OpAMD64ADDL return true - case OpAddDotProdInt32x16: - v.Op = OpAMD64VPDPWSSD512 + case OpAddDotProdPairsSaturatedInt32x16: + v.Op = OpAMD64VPDPWSSDS512 + return true + case OpAddDotProdPairsSaturatedInt32x4: + v.Op = OpAMD64VPDPWSSDS128 + return true + case OpAddDotProdPairsSaturatedInt32x8: + v.Op = OpAMD64VPDPWSSDS256 + return true + case OpAddDotProdPairsSaturatedMaskedInt32x16: + return rewriteValueAMD64_OpAddDotProdPairsSaturatedMaskedInt32x16(v) + case OpAddDotProdPairsSaturatedMaskedInt32x4: + return rewriteValueAMD64_OpAddDotProdPairsSaturatedMaskedInt32x4(v) + case OpAddDotProdPairsSaturatedMaskedInt32x8: + return rewriteValueAMD64_OpAddDotProdPairsSaturatedMaskedInt32x8(v) + case OpAddDotProdQuadrupleInt32x16: + v.Op = OpAMD64VPDPBUSD512 + return true + case OpAddDotProdQuadrupleInt32x4: + v.Op = OpAMD64VPDPBUSD128 return true - case OpAddDotProdInt32x4: - v.Op = OpAMD64VPDPWSSD128 + case OpAddDotProdQuadrupleInt32x8: + v.Op = OpAMD64VPDPBUSD256 return true - case OpAddDotProdInt32x8: - v.Op = OpAMD64VPDPWSSD256 + case OpAddDotProdQuadrupleMaskedInt32x16: + return rewriteValueAMD64_OpAddDotProdQuadrupleMaskedInt32x16(v) + case OpAddDotProdQuadrupleMaskedInt32x4: + return rewriteValueAMD64_OpAddDotProdQuadrupleMaskedInt32x4(v) + case OpAddDotProdQuadrupleMaskedInt32x8: + return rewriteValueAMD64_OpAddDotProdQuadrupleMaskedInt32x8(v) + case OpAddDotProdQuadrupleSaturatedInt32x16: + v.Op = OpAMD64VPDPBUSDS512 return true - case OpAddDotProdMaskedInt32x16: - return rewriteValueAMD64_OpAddDotProdMaskedInt32x16(v) - case OpAddDotProdMaskedInt32x4: - return rewriteValueAMD64_OpAddDotProdMaskedInt32x4(v) - case OpAddDotProdMaskedInt32x8: - return rewriteValueAMD64_OpAddDotProdMaskedInt32x8(v) + case OpAddDotProdQuadrupleSaturatedInt32x4: + v.Op = OpAMD64VPDPBUSDS128 + return true + case OpAddDotProdQuadrupleSaturatedInt32x8: + v.Op = OpAMD64VPDPBUSDS256 + return true + case OpAddDotProdQuadrupleSaturatedMaskedInt32x16: + return rewriteValueAMD64_OpAddDotProdQuadrupleSaturatedMaskedInt32x16(v) + case OpAddDotProdQuadrupleSaturatedMaskedInt32x4: + return rewriteValueAMD64_OpAddDotProdQuadrupleSaturatedMaskedInt32x4(v) + case OpAddDotProdQuadrupleSaturatedMaskedInt32x8: + return rewriteValueAMD64_OpAddDotProdQuadrupleSaturatedMaskedInt32x8(v) case OpAddFloat32x16: v.Op = OpAMD64VADDPS512 return true @@ -854,22 +884,22 @@ func rewriteValueAMD64(v *Value) bool { case OpAddSaturatedMaskedUint8x64: return rewriteValueAMD64_OpAddSaturatedMaskedUint8x64(v) case OpAddSaturatedUint16x16: - v.Op = OpAMD64VPADDSW256 + v.Op = OpAMD64VPADDUSW256 return true case OpAddSaturatedUint16x32: - v.Op = OpAMD64VPADDSW512 + v.Op = OpAMD64VPADDUSW512 return true case OpAddSaturatedUint16x8: - v.Op = OpAMD64VPADDSW128 + v.Op = OpAMD64VPADDUSW128 return true case OpAddSaturatedUint8x16: - v.Op = OpAMD64VPADDSB128 + v.Op = OpAMD64VPADDUSB128 return true case OpAddSaturatedUint8x32: - v.Op = OpAMD64VPADDSB256 + v.Op = OpAMD64VPADDUSB256 return true case OpAddSaturatedUint8x64: - 
v.Op = OpAMD64VPADDSB512 + v.Op = OpAMD64VPADDUSB512 return true case OpAddSubFloat32x4: v.Op = OpAMD64VADDSUBPS128 @@ -1128,66 +1158,6 @@ func rewriteValueAMD64(v *Value) bool { case OpAndUint8x64: v.Op = OpAMD64VPANDD512 return true - case OpApproximateReciprocalFloat32x16: - v.Op = OpAMD64VRCP14PS512 - return true - case OpApproximateReciprocalFloat32x4: - v.Op = OpAMD64VRCPPS128 - return true - case OpApproximateReciprocalFloat32x8: - v.Op = OpAMD64VRCPPS256 - return true - case OpApproximateReciprocalFloat64x2: - v.Op = OpAMD64VRCP14PD128 - return true - case OpApproximateReciprocalFloat64x4: - v.Op = OpAMD64VRCP14PD256 - return true - case OpApproximateReciprocalFloat64x8: - v.Op = OpAMD64VRCP14PD512 - return true - case OpApproximateReciprocalMaskedFloat32x16: - return rewriteValueAMD64_OpApproximateReciprocalMaskedFloat32x16(v) - case OpApproximateReciprocalMaskedFloat32x4: - return rewriteValueAMD64_OpApproximateReciprocalMaskedFloat32x4(v) - case OpApproximateReciprocalMaskedFloat32x8: - return rewriteValueAMD64_OpApproximateReciprocalMaskedFloat32x8(v) - case OpApproximateReciprocalMaskedFloat64x2: - return rewriteValueAMD64_OpApproximateReciprocalMaskedFloat64x2(v) - case OpApproximateReciprocalMaskedFloat64x4: - return rewriteValueAMD64_OpApproximateReciprocalMaskedFloat64x4(v) - case OpApproximateReciprocalMaskedFloat64x8: - return rewriteValueAMD64_OpApproximateReciprocalMaskedFloat64x8(v) - case OpApproximateReciprocalOfSqrtFloat32x16: - v.Op = OpAMD64VRSQRT14PS512 - return true - case OpApproximateReciprocalOfSqrtFloat32x4: - v.Op = OpAMD64VRSQRTPS128 - return true - case OpApproximateReciprocalOfSqrtFloat32x8: - v.Op = OpAMD64VRSQRTPS256 - return true - case OpApproximateReciprocalOfSqrtFloat64x2: - v.Op = OpAMD64VRSQRT14PD128 - return true - case OpApproximateReciprocalOfSqrtFloat64x4: - v.Op = OpAMD64VRSQRT14PD256 - return true - case OpApproximateReciprocalOfSqrtFloat64x8: - v.Op = OpAMD64VRSQRT14PD512 - return true - case OpApproximateReciprocalOfSqrtMaskedFloat32x16: - return rewriteValueAMD64_OpApproximateReciprocalOfSqrtMaskedFloat32x16(v) - case OpApproximateReciprocalOfSqrtMaskedFloat32x4: - return rewriteValueAMD64_OpApproximateReciprocalOfSqrtMaskedFloat32x4(v) - case OpApproximateReciprocalOfSqrtMaskedFloat32x8: - return rewriteValueAMD64_OpApproximateReciprocalOfSqrtMaskedFloat32x8(v) - case OpApproximateReciprocalOfSqrtMaskedFloat64x2: - return rewriteValueAMD64_OpApproximateReciprocalOfSqrtMaskedFloat64x2(v) - case OpApproximateReciprocalOfSqrtMaskedFloat64x4: - return rewriteValueAMD64_OpApproximateReciprocalOfSqrtMaskedFloat64x4(v) - case OpApproximateReciprocalOfSqrtMaskedFloat64x8: - return rewriteValueAMD64_OpApproximateReciprocalOfSqrtMaskedFloat64x8(v) case OpAtomicAdd32: return rewriteValueAMD64_OpAtomicAdd32(v) case OpAtomicAdd64: @@ -1468,6 +1438,24 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpConvertToUint32MaskedFloat32x4(v) case OpConvertToUint32MaskedFloat32x8: return rewriteValueAMD64_OpConvertToUint32MaskedFloat32x8(v) + case OpCopySignInt16x16: + v.Op = OpAMD64VPSIGNW256 + return true + case OpCopySignInt16x8: + v.Op = OpAMD64VPSIGNW128 + return true + case OpCopySignInt32x4: + v.Op = OpAMD64VPSIGND128 + return true + case OpCopySignInt32x8: + v.Op = OpAMD64VPSIGND256 + return true + case OpCopySignInt8x16: + v.Op = OpAMD64VPSIGNB128 + return true + case OpCopySignInt8x32: + v.Op = OpAMD64VPSIGNB256 + return true case OpCtz16: return rewriteValueAMD64_OpCtz16(v) case OpCtz16NonZero: @@ -1620,12 +1608,36 @@ func 
rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpDivMaskedFloat64x4(v) case OpDivMaskedFloat64x8: return rewriteValueAMD64_OpDivMaskedFloat64x8(v) - case OpDotProdBroadcastFloat32x4: - return rewriteValueAMD64_OpDotProdBroadcastFloat32x4(v) - case OpDotProdBroadcastFloat32x8: - return rewriteValueAMD64_OpDotProdBroadcastFloat32x8(v) - case OpDotProdBroadcastFloat64x2: - return rewriteValueAMD64_OpDotProdBroadcastFloat64x2(v) + case OpDotProdPairsInt16x16: + v.Op = OpAMD64VPMADDWD256 + return true + case OpDotProdPairsInt16x32: + v.Op = OpAMD64VPMADDWD512 + return true + case OpDotProdPairsInt16x8: + v.Op = OpAMD64VPMADDWD128 + return true + case OpDotProdPairsMaskedInt16x16: + return rewriteValueAMD64_OpDotProdPairsMaskedInt16x16(v) + case OpDotProdPairsMaskedInt16x32: + return rewriteValueAMD64_OpDotProdPairsMaskedInt16x32(v) + case OpDotProdPairsMaskedInt16x8: + return rewriteValueAMD64_OpDotProdPairsMaskedInt16x8(v) + case OpDotProdPairsSaturatedMaskedUint8x16: + return rewriteValueAMD64_OpDotProdPairsSaturatedMaskedUint8x16(v) + case OpDotProdPairsSaturatedMaskedUint8x32: + return rewriteValueAMD64_OpDotProdPairsSaturatedMaskedUint8x32(v) + case OpDotProdPairsSaturatedMaskedUint8x64: + return rewriteValueAMD64_OpDotProdPairsSaturatedMaskedUint8x64(v) + case OpDotProdPairsSaturatedUint8x16: + v.Op = OpAMD64VPMADDUBSW128 + return true + case OpDotProdPairsSaturatedUint8x32: + v.Op = OpAMD64VPMADDUBSW256 + return true + case OpDotProdPairsSaturatedUint8x64: + v.Op = OpAMD64VPMADDUBSW512 + return true case OpEq16: return rewriteValueAMD64_OpEq16(v) case OpEq32: @@ -1898,96 +1910,6 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x4(v) case OpFloorScaledResidueMaskedFloat64x8: return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x8(v) - case OpFusedMultiplyAddFloat32x16: - v.Op = OpAMD64VFMADD213PS512 - return true - case OpFusedMultiplyAddFloat32x4: - v.Op = OpAMD64VFMADD213PS128 - return true - case OpFusedMultiplyAddFloat32x8: - v.Op = OpAMD64VFMADD213PS256 - return true - case OpFusedMultiplyAddFloat64x2: - v.Op = OpAMD64VFMADD213PD128 - return true - case OpFusedMultiplyAddFloat64x4: - v.Op = OpAMD64VFMADD213PD256 - return true - case OpFusedMultiplyAddFloat64x8: - v.Op = OpAMD64VFMADD213PD512 - return true - case OpFusedMultiplyAddMaskedFloat32x16: - return rewriteValueAMD64_OpFusedMultiplyAddMaskedFloat32x16(v) - case OpFusedMultiplyAddMaskedFloat32x4: - return rewriteValueAMD64_OpFusedMultiplyAddMaskedFloat32x4(v) - case OpFusedMultiplyAddMaskedFloat32x8: - return rewriteValueAMD64_OpFusedMultiplyAddMaskedFloat32x8(v) - case OpFusedMultiplyAddMaskedFloat64x2: - return rewriteValueAMD64_OpFusedMultiplyAddMaskedFloat64x2(v) - case OpFusedMultiplyAddMaskedFloat64x4: - return rewriteValueAMD64_OpFusedMultiplyAddMaskedFloat64x4(v) - case OpFusedMultiplyAddMaskedFloat64x8: - return rewriteValueAMD64_OpFusedMultiplyAddMaskedFloat64x8(v) - case OpFusedMultiplyAddSubFloat32x16: - v.Op = OpAMD64VFMADDSUB213PS512 - return true - case OpFusedMultiplyAddSubFloat32x4: - v.Op = OpAMD64VFMADDSUB213PS128 - return true - case OpFusedMultiplyAddSubFloat32x8: - v.Op = OpAMD64VFMADDSUB213PS256 - return true - case OpFusedMultiplyAddSubFloat64x2: - v.Op = OpAMD64VFMADDSUB213PD128 - return true - case OpFusedMultiplyAddSubFloat64x4: - v.Op = OpAMD64VFMADDSUB213PD256 - return true - case OpFusedMultiplyAddSubFloat64x8: - v.Op = OpAMD64VFMADDSUB213PD512 - return true - case OpFusedMultiplyAddSubMaskedFloat32x16: - return 
rewriteValueAMD64_OpFusedMultiplyAddSubMaskedFloat32x16(v) - case OpFusedMultiplyAddSubMaskedFloat32x4: - return rewriteValueAMD64_OpFusedMultiplyAddSubMaskedFloat32x4(v) - case OpFusedMultiplyAddSubMaskedFloat32x8: - return rewriteValueAMD64_OpFusedMultiplyAddSubMaskedFloat32x8(v) - case OpFusedMultiplyAddSubMaskedFloat64x2: - return rewriteValueAMD64_OpFusedMultiplyAddSubMaskedFloat64x2(v) - case OpFusedMultiplyAddSubMaskedFloat64x4: - return rewriteValueAMD64_OpFusedMultiplyAddSubMaskedFloat64x4(v) - case OpFusedMultiplyAddSubMaskedFloat64x8: - return rewriteValueAMD64_OpFusedMultiplyAddSubMaskedFloat64x8(v) - case OpFusedMultiplySubAddFloat32x16: - v.Op = OpAMD64VFMSUBADD213PS512 - return true - case OpFusedMultiplySubAddFloat32x4: - v.Op = OpAMD64VFMSUBADD213PS128 - return true - case OpFusedMultiplySubAddFloat32x8: - v.Op = OpAMD64VFMSUBADD213PS256 - return true - case OpFusedMultiplySubAddFloat64x2: - v.Op = OpAMD64VFMSUBADD213PD128 - return true - case OpFusedMultiplySubAddFloat64x4: - v.Op = OpAMD64VFMSUBADD213PD256 - return true - case OpFusedMultiplySubAddFloat64x8: - v.Op = OpAMD64VFMSUBADD213PD512 - return true - case OpFusedMultiplySubAddMaskedFloat32x16: - return rewriteValueAMD64_OpFusedMultiplySubAddMaskedFloat32x16(v) - case OpFusedMultiplySubAddMaskedFloat32x4: - return rewriteValueAMD64_OpFusedMultiplySubAddMaskedFloat32x4(v) - case OpFusedMultiplySubAddMaskedFloat32x8: - return rewriteValueAMD64_OpFusedMultiplySubAddMaskedFloat32x8(v) - case OpFusedMultiplySubAddMaskedFloat64x2: - return rewriteValueAMD64_OpFusedMultiplySubAddMaskedFloat64x2(v) - case OpFusedMultiplySubAddMaskedFloat64x4: - return rewriteValueAMD64_OpFusedMultiplySubAddMaskedFloat64x4(v) - case OpFusedMultiplySubAddMaskedFloat64x8: - return rewriteValueAMD64_OpFusedMultiplySubAddMaskedFloat64x8(v) case OpGaloisFieldAffineTransformInverseMaskedUint8x16: return rewriteValueAMD64_OpGaloisFieldAffineTransformInverseMaskedUint8x16(v) case OpGaloisFieldAffineTransformInverseMaskedUint8x32: @@ -3138,48 +3060,78 @@ func rewriteValueAMD64(v *Value) bool { case OpMul8: v.Op = OpAMD64MULL return true - case OpMulEvenWidenInt32x4: - v.Op = OpAMD64VPMULDQ128 + case OpMulAddFloat32x16: + v.Op = OpAMD64VFMADD213PS512 return true - case OpMulEvenWidenInt32x8: - v.Op = OpAMD64VPMULDQ256 + case OpMulAddFloat32x4: + v.Op = OpAMD64VFMADD213PS128 + return true + case OpMulAddFloat32x8: + v.Op = OpAMD64VFMADD213PS256 + return true + case OpMulAddFloat64x2: + v.Op = OpAMD64VFMADD213PD128 + return true + case OpMulAddFloat64x4: + v.Op = OpAMD64VFMADD213PD256 return true - case OpMulEvenWidenInt64x2: + case OpMulAddFloat64x8: + v.Op = OpAMD64VFMADD213PD512 + return true + case OpMulAddMaskedFloat32x16: + return rewriteValueAMD64_OpMulAddMaskedFloat32x16(v) + case OpMulAddMaskedFloat32x4: + return rewriteValueAMD64_OpMulAddMaskedFloat32x4(v) + case OpMulAddMaskedFloat32x8: + return rewriteValueAMD64_OpMulAddMaskedFloat32x8(v) + case OpMulAddMaskedFloat64x2: + return rewriteValueAMD64_OpMulAddMaskedFloat64x2(v) + case OpMulAddMaskedFloat64x4: + return rewriteValueAMD64_OpMulAddMaskedFloat64x4(v) + case OpMulAddMaskedFloat64x8: + return rewriteValueAMD64_OpMulAddMaskedFloat64x8(v) + case OpMulAddSubFloat32x16: + v.Op = OpAMD64VFMADDSUB213PS512 + return true + case OpMulAddSubFloat32x4: + v.Op = OpAMD64VFMADDSUB213PS128 + return true + case OpMulAddSubFloat32x8: + v.Op = OpAMD64VFMADDSUB213PS256 + return true + case OpMulAddSubFloat64x2: + v.Op = OpAMD64VFMADDSUB213PD128 + return true + case OpMulAddSubFloat64x4: + v.Op = 
OpAMD64VFMADDSUB213PD256 + return true + case OpMulAddSubFloat64x8: + v.Op = OpAMD64VFMADDSUB213PD512 + return true + case OpMulAddSubMaskedFloat32x16: + return rewriteValueAMD64_OpMulAddSubMaskedFloat32x16(v) + case OpMulAddSubMaskedFloat32x4: + return rewriteValueAMD64_OpMulAddSubMaskedFloat32x4(v) + case OpMulAddSubMaskedFloat32x8: + return rewriteValueAMD64_OpMulAddSubMaskedFloat32x8(v) + case OpMulAddSubMaskedFloat64x2: + return rewriteValueAMD64_OpMulAddSubMaskedFloat64x2(v) + case OpMulAddSubMaskedFloat64x4: + return rewriteValueAMD64_OpMulAddSubMaskedFloat64x4(v) + case OpMulAddSubMaskedFloat64x8: + return rewriteValueAMD64_OpMulAddSubMaskedFloat64x8(v) + case OpMulEvenWidenInt32x4: v.Op = OpAMD64VPMULDQ128 return true - case OpMulEvenWidenInt64x4: + case OpMulEvenWidenInt32x8: v.Op = OpAMD64VPMULDQ256 return true - case OpMulEvenWidenInt64x8: - v.Op = OpAMD64VPMULDQ512 - return true - case OpMulEvenWidenMaskedInt64x2: - return rewriteValueAMD64_OpMulEvenWidenMaskedInt64x2(v) - case OpMulEvenWidenMaskedInt64x4: - return rewriteValueAMD64_OpMulEvenWidenMaskedInt64x4(v) - case OpMulEvenWidenMaskedInt64x8: - return rewriteValueAMD64_OpMulEvenWidenMaskedInt64x8(v) - case OpMulEvenWidenMaskedUint64x2: - return rewriteValueAMD64_OpMulEvenWidenMaskedUint64x2(v) - case OpMulEvenWidenMaskedUint64x4: - return rewriteValueAMD64_OpMulEvenWidenMaskedUint64x4(v) - case OpMulEvenWidenMaskedUint64x8: - return rewriteValueAMD64_OpMulEvenWidenMaskedUint64x8(v) case OpMulEvenWidenUint32x4: v.Op = OpAMD64VPMULUDQ128 return true case OpMulEvenWidenUint32x8: v.Op = OpAMD64VPMULUDQ256 return true - case OpMulEvenWidenUint64x2: - v.Op = OpAMD64VPMULUDQ128 - return true - case OpMulEvenWidenUint64x4: - v.Op = OpAMD64VPMULUDQ256 - return true - case OpMulEvenWidenUint64x8: - v.Op = OpAMD64VPMULUDQ512 - return true case OpMulFloat32x16: v.Op = OpAMD64VMULPS512 return true @@ -3199,13 +3151,13 @@ func rewriteValueAMD64(v *Value) bool { v.Op = OpAMD64VMULPD512 return true case OpMulHighInt16x16: - v.Op = OpAMD64VPMULHW256 + v.Op = OpAMD64VPMULHUW256 return true case OpMulHighInt16x32: v.Op = OpAMD64VPMULHW512 return true case OpMulHighInt16x8: - v.Op = OpAMD64VPMULHW128 + v.Op = OpAMD64VPMULHUW128 return true case OpMulHighMaskedInt16x16: return rewriteValueAMD64_OpMulHighMaskedInt16x16(v) @@ -3213,21 +3165,6 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpMulHighMaskedInt16x32(v) case OpMulHighMaskedInt16x8: return rewriteValueAMD64_OpMulHighMaskedInt16x8(v) - case OpMulHighMaskedUint16x16: - return rewriteValueAMD64_OpMulHighMaskedUint16x16(v) - case OpMulHighMaskedUint16x32: - return rewriteValueAMD64_OpMulHighMaskedUint16x32(v) - case OpMulHighMaskedUint16x8: - return rewriteValueAMD64_OpMulHighMaskedUint16x8(v) - case OpMulHighUint16x16: - v.Op = OpAMD64VPMULHUW256 - return true - case OpMulHighUint16x32: - v.Op = OpAMD64VPMULHUW512 - return true - case OpMulHighUint16x8: - v.Op = OpAMD64VPMULHUW128 - return true case OpMulInt16x16: v.Op = OpAMD64VPMULLW256 return true @@ -3285,6 +3222,81 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpMulMaskedInt64x4(v) case OpMulMaskedInt64x8: return rewriteValueAMD64_OpMulMaskedInt64x8(v) + case OpMulMaskedUint16x16: + return rewriteValueAMD64_OpMulMaskedUint16x16(v) + case OpMulMaskedUint16x32: + return rewriteValueAMD64_OpMulMaskedUint16x32(v) + case OpMulMaskedUint16x8: + return rewriteValueAMD64_OpMulMaskedUint16x8(v) + case OpMulMaskedUint32x16: + return rewriteValueAMD64_OpMulMaskedUint32x16(v) + case 
OpMulMaskedUint32x4: + return rewriteValueAMD64_OpMulMaskedUint32x4(v) + case OpMulMaskedUint32x8: + return rewriteValueAMD64_OpMulMaskedUint32x8(v) + case OpMulMaskedUint64x2: + return rewriteValueAMD64_OpMulMaskedUint64x2(v) + case OpMulMaskedUint64x4: + return rewriteValueAMD64_OpMulMaskedUint64x4(v) + case OpMulMaskedUint64x8: + return rewriteValueAMD64_OpMulMaskedUint64x8(v) + case OpMulSubAddFloat32x16: + v.Op = OpAMD64VFMSUBADD213PS512 + return true + case OpMulSubAddFloat32x4: + v.Op = OpAMD64VFMSUBADD213PS128 + return true + case OpMulSubAddFloat32x8: + v.Op = OpAMD64VFMSUBADD213PS256 + return true + case OpMulSubAddFloat64x2: + v.Op = OpAMD64VFMSUBADD213PD128 + return true + case OpMulSubAddFloat64x4: + v.Op = OpAMD64VFMSUBADD213PD256 + return true + case OpMulSubAddFloat64x8: + v.Op = OpAMD64VFMSUBADD213PD512 + return true + case OpMulSubAddMaskedFloat32x16: + return rewriteValueAMD64_OpMulSubAddMaskedFloat32x16(v) + case OpMulSubAddMaskedFloat32x4: + return rewriteValueAMD64_OpMulSubAddMaskedFloat32x4(v) + case OpMulSubAddMaskedFloat32x8: + return rewriteValueAMD64_OpMulSubAddMaskedFloat32x8(v) + case OpMulSubAddMaskedFloat64x2: + return rewriteValueAMD64_OpMulSubAddMaskedFloat64x2(v) + case OpMulSubAddMaskedFloat64x4: + return rewriteValueAMD64_OpMulSubAddMaskedFloat64x4(v) + case OpMulSubAddMaskedFloat64x8: + return rewriteValueAMD64_OpMulSubAddMaskedFloat64x8(v) + case OpMulUint16x16: + v.Op = OpAMD64VPMULLW256 + return true + case OpMulUint16x32: + v.Op = OpAMD64VPMULLW512 + return true + case OpMulUint16x8: + v.Op = OpAMD64VPMULLW128 + return true + case OpMulUint32x16: + v.Op = OpAMD64VPMULLD512 + return true + case OpMulUint32x4: + v.Op = OpAMD64VPMULLD128 + return true + case OpMulUint32x8: + v.Op = OpAMD64VPMULLD256 + return true + case OpMulUint64x2: + v.Op = OpAMD64VPMULLQ128 + return true + case OpMulUint64x4: + v.Op = OpAMD64VPMULLQ256 + return true + case OpMulUint64x8: + v.Op = OpAMD64VPMULLQ512 + return true case OpNeg16: v.Op = OpAMD64NEGL return true @@ -3444,6 +3456,126 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpNotEqualUint8x64(v) case OpOffPtr: return rewriteValueAMD64_OpOffPtr(v) + case OpOnesCountInt16x16: + v.Op = OpAMD64VPOPCNTW256 + return true + case OpOnesCountInt16x32: + v.Op = OpAMD64VPOPCNTW512 + return true + case OpOnesCountInt16x8: + v.Op = OpAMD64VPOPCNTW128 + return true + case OpOnesCountInt32x16: + v.Op = OpAMD64VPOPCNTD512 + return true + case OpOnesCountInt32x4: + v.Op = OpAMD64VPOPCNTD128 + return true + case OpOnesCountInt32x8: + v.Op = OpAMD64VPOPCNTD256 + return true + case OpOnesCountInt64x2: + v.Op = OpAMD64VPOPCNTQ128 + return true + case OpOnesCountInt64x4: + v.Op = OpAMD64VPOPCNTQ256 + return true + case OpOnesCountInt64x8: + v.Op = OpAMD64VPOPCNTQ512 + return true + case OpOnesCountInt8x16: + v.Op = OpAMD64VPOPCNTB128 + return true + case OpOnesCountInt8x32: + v.Op = OpAMD64VPOPCNTB256 + return true + case OpOnesCountInt8x64: + v.Op = OpAMD64VPOPCNTB512 + return true + case OpOnesCountMaskedInt16x16: + return rewriteValueAMD64_OpOnesCountMaskedInt16x16(v) + case OpOnesCountMaskedInt16x32: + return rewriteValueAMD64_OpOnesCountMaskedInt16x32(v) + case OpOnesCountMaskedInt16x8: + return rewriteValueAMD64_OpOnesCountMaskedInt16x8(v) + case OpOnesCountMaskedInt32x16: + return rewriteValueAMD64_OpOnesCountMaskedInt32x16(v) + case OpOnesCountMaskedInt32x4: + return rewriteValueAMD64_OpOnesCountMaskedInt32x4(v) + case OpOnesCountMaskedInt32x8: + return rewriteValueAMD64_OpOnesCountMaskedInt32x8(v) + case 
OpOnesCountMaskedInt64x2: + return rewriteValueAMD64_OpOnesCountMaskedInt64x2(v) + case OpOnesCountMaskedInt64x4: + return rewriteValueAMD64_OpOnesCountMaskedInt64x4(v) + case OpOnesCountMaskedInt64x8: + return rewriteValueAMD64_OpOnesCountMaskedInt64x8(v) + case OpOnesCountMaskedInt8x16: + return rewriteValueAMD64_OpOnesCountMaskedInt8x16(v) + case OpOnesCountMaskedInt8x32: + return rewriteValueAMD64_OpOnesCountMaskedInt8x32(v) + case OpOnesCountMaskedInt8x64: + return rewriteValueAMD64_OpOnesCountMaskedInt8x64(v) + case OpOnesCountMaskedUint16x16: + return rewriteValueAMD64_OpOnesCountMaskedUint16x16(v) + case OpOnesCountMaskedUint16x32: + return rewriteValueAMD64_OpOnesCountMaskedUint16x32(v) + case OpOnesCountMaskedUint16x8: + return rewriteValueAMD64_OpOnesCountMaskedUint16x8(v) + case OpOnesCountMaskedUint32x16: + return rewriteValueAMD64_OpOnesCountMaskedUint32x16(v) + case OpOnesCountMaskedUint32x4: + return rewriteValueAMD64_OpOnesCountMaskedUint32x4(v) + case OpOnesCountMaskedUint32x8: + return rewriteValueAMD64_OpOnesCountMaskedUint32x8(v) + case OpOnesCountMaskedUint64x2: + return rewriteValueAMD64_OpOnesCountMaskedUint64x2(v) + case OpOnesCountMaskedUint64x4: + return rewriteValueAMD64_OpOnesCountMaskedUint64x4(v) + case OpOnesCountMaskedUint64x8: + return rewriteValueAMD64_OpOnesCountMaskedUint64x8(v) + case OpOnesCountMaskedUint8x16: + return rewriteValueAMD64_OpOnesCountMaskedUint8x16(v) + case OpOnesCountMaskedUint8x32: + return rewriteValueAMD64_OpOnesCountMaskedUint8x32(v) + case OpOnesCountMaskedUint8x64: + return rewriteValueAMD64_OpOnesCountMaskedUint8x64(v) + case OpOnesCountUint16x16: + v.Op = OpAMD64VPOPCNTW256 + return true + case OpOnesCountUint16x32: + v.Op = OpAMD64VPOPCNTW512 + return true + case OpOnesCountUint16x8: + v.Op = OpAMD64VPOPCNTW128 + return true + case OpOnesCountUint32x16: + v.Op = OpAMD64VPOPCNTD512 + return true + case OpOnesCountUint32x4: + v.Op = OpAMD64VPOPCNTD128 + return true + case OpOnesCountUint32x8: + v.Op = OpAMD64VPOPCNTD256 + return true + case OpOnesCountUint64x2: + v.Op = OpAMD64VPOPCNTQ128 + return true + case OpOnesCountUint64x4: + v.Op = OpAMD64VPOPCNTQ256 + return true + case OpOnesCountUint64x8: + v.Op = OpAMD64VPOPCNTQ512 + return true + case OpOnesCountUint8x16: + v.Op = OpAMD64VPOPCNTB128 + return true + case OpOnesCountUint8x32: + v.Op = OpAMD64VPOPCNTB256 + return true + case OpOnesCountUint8x64: + v.Op = OpAMD64VPOPCNTB512 + return true case OpOr16: v.Op = OpAMD64ORL return true @@ -3555,21 +3687,6 @@ func rewriteValueAMD64(v *Value) bool { case OpOrUint8x64: v.Op = OpAMD64VPORD512 return true - case OpPairDotProdInt16x16: - v.Op = OpAMD64VPMADDWD256 - return true - case OpPairDotProdInt16x32: - v.Op = OpAMD64VPMADDWD512 - return true - case OpPairDotProdInt16x8: - v.Op = OpAMD64VPMADDWD128 - return true - case OpPairDotProdMaskedInt16x16: - return rewriteValueAMD64_OpPairDotProdMaskedInt16x16(v) - case OpPairDotProdMaskedInt16x32: - return rewriteValueAMD64_OpPairDotProdMaskedInt16x32(v) - case OpPairDotProdMaskedInt16x8: - return rewriteValueAMD64_OpPairDotProdMaskedInt16x8(v) case OpPanicBounds: v.Op = OpAMD64LoweredPanicBoundsRR return true @@ -3853,132 +3970,72 @@ func rewriteValueAMD64(v *Value) bool { return true case OpPopCount8: return rewriteValueAMD64_OpPopCount8(v) - case OpPopCountInt16x16: - v.Op = OpAMD64VPOPCNTW256 - return true - case OpPopCountInt16x32: - v.Op = OpAMD64VPOPCNTW512 - return true - case OpPopCountInt16x8: - v.Op = OpAMD64VPOPCNTW128 - return true - case OpPopCountInt32x16: - v.Op = 
OpAMD64VPOPCNTD512 - return true - case OpPopCountInt32x4: - v.Op = OpAMD64VPOPCNTD128 - return true - case OpPopCountInt32x8: - v.Op = OpAMD64VPOPCNTD256 - return true - case OpPopCountInt64x2: - v.Op = OpAMD64VPOPCNTQ128 - return true - case OpPopCountInt64x4: - v.Op = OpAMD64VPOPCNTQ256 - return true - case OpPopCountInt64x8: - v.Op = OpAMD64VPOPCNTQ512 - return true - case OpPopCountInt8x16: - v.Op = OpAMD64VPOPCNTB128 - return true - case OpPopCountInt8x32: - v.Op = OpAMD64VPOPCNTB256 - return true - case OpPopCountInt8x64: - v.Op = OpAMD64VPOPCNTB512 - return true - case OpPopCountMaskedInt16x16: - return rewriteValueAMD64_OpPopCountMaskedInt16x16(v) - case OpPopCountMaskedInt16x32: - return rewriteValueAMD64_OpPopCountMaskedInt16x32(v) - case OpPopCountMaskedInt16x8: - return rewriteValueAMD64_OpPopCountMaskedInt16x8(v) - case OpPopCountMaskedInt32x16: - return rewriteValueAMD64_OpPopCountMaskedInt32x16(v) - case OpPopCountMaskedInt32x4: - return rewriteValueAMD64_OpPopCountMaskedInt32x4(v) - case OpPopCountMaskedInt32x8: - return rewriteValueAMD64_OpPopCountMaskedInt32x8(v) - case OpPopCountMaskedInt64x2: - return rewriteValueAMD64_OpPopCountMaskedInt64x2(v) - case OpPopCountMaskedInt64x4: - return rewriteValueAMD64_OpPopCountMaskedInt64x4(v) - case OpPopCountMaskedInt64x8: - return rewriteValueAMD64_OpPopCountMaskedInt64x8(v) - case OpPopCountMaskedInt8x16: - return rewriteValueAMD64_OpPopCountMaskedInt8x16(v) - case OpPopCountMaskedInt8x32: - return rewriteValueAMD64_OpPopCountMaskedInt8x32(v) - case OpPopCountMaskedInt8x64: - return rewriteValueAMD64_OpPopCountMaskedInt8x64(v) - case OpPopCountMaskedUint16x16: - return rewriteValueAMD64_OpPopCountMaskedUint16x16(v) - case OpPopCountMaskedUint16x32: - return rewriteValueAMD64_OpPopCountMaskedUint16x32(v) - case OpPopCountMaskedUint16x8: - return rewriteValueAMD64_OpPopCountMaskedUint16x8(v) - case OpPopCountMaskedUint32x16: - return rewriteValueAMD64_OpPopCountMaskedUint32x16(v) - case OpPopCountMaskedUint32x4: - return rewriteValueAMD64_OpPopCountMaskedUint32x4(v) - case OpPopCountMaskedUint32x8: - return rewriteValueAMD64_OpPopCountMaskedUint32x8(v) - case OpPopCountMaskedUint64x2: - return rewriteValueAMD64_OpPopCountMaskedUint64x2(v) - case OpPopCountMaskedUint64x4: - return rewriteValueAMD64_OpPopCountMaskedUint64x4(v) - case OpPopCountMaskedUint64x8: - return rewriteValueAMD64_OpPopCountMaskedUint64x8(v) - case OpPopCountMaskedUint8x16: - return rewriteValueAMD64_OpPopCountMaskedUint8x16(v) - case OpPopCountMaskedUint8x32: - return rewriteValueAMD64_OpPopCountMaskedUint8x32(v) - case OpPopCountMaskedUint8x64: - return rewriteValueAMD64_OpPopCountMaskedUint8x64(v) - case OpPopCountUint16x16: - v.Op = OpAMD64VPOPCNTW256 + case OpPrefetchCache: + v.Op = OpAMD64PrefetchT0 return true - case OpPopCountUint16x32: - v.Op = OpAMD64VPOPCNTW512 + case OpPrefetchCacheStreamed: + v.Op = OpAMD64PrefetchNTA return true - case OpPopCountUint16x8: - v.Op = OpAMD64VPOPCNTW128 + case OpReciprocalFloat32x16: + v.Op = OpAMD64VRCP14PS512 return true - case OpPopCountUint32x16: - v.Op = OpAMD64VPOPCNTD512 + case OpReciprocalFloat32x4: + v.Op = OpAMD64VRCPPS128 return true - case OpPopCountUint32x4: - v.Op = OpAMD64VPOPCNTD128 + case OpReciprocalFloat32x8: + v.Op = OpAMD64VRCPPS256 return true - case OpPopCountUint32x8: - v.Op = OpAMD64VPOPCNTD256 + case OpReciprocalFloat64x2: + v.Op = OpAMD64VRCP14PD128 return true - case OpPopCountUint64x2: - v.Op = OpAMD64VPOPCNTQ128 + case OpReciprocalFloat64x4: + v.Op = OpAMD64VRCP14PD256 return true - case 
OpPopCountUint64x4: - v.Op = OpAMD64VPOPCNTQ256 + case OpReciprocalFloat64x8: + v.Op = OpAMD64VRCP14PD512 return true - case OpPopCountUint64x8: - v.Op = OpAMD64VPOPCNTQ512 + case OpReciprocalMaskedFloat32x16: + return rewriteValueAMD64_OpReciprocalMaskedFloat32x16(v) + case OpReciprocalMaskedFloat32x4: + return rewriteValueAMD64_OpReciprocalMaskedFloat32x4(v) + case OpReciprocalMaskedFloat32x8: + return rewriteValueAMD64_OpReciprocalMaskedFloat32x8(v) + case OpReciprocalMaskedFloat64x2: + return rewriteValueAMD64_OpReciprocalMaskedFloat64x2(v) + case OpReciprocalMaskedFloat64x4: + return rewriteValueAMD64_OpReciprocalMaskedFloat64x4(v) + case OpReciprocalMaskedFloat64x8: + return rewriteValueAMD64_OpReciprocalMaskedFloat64x8(v) + case OpReciprocalSqrtFloat32x16: + v.Op = OpAMD64VRSQRT14PS512 return true - case OpPopCountUint8x16: - v.Op = OpAMD64VPOPCNTB128 + case OpReciprocalSqrtFloat32x4: + v.Op = OpAMD64VRSQRTPS128 return true - case OpPopCountUint8x32: - v.Op = OpAMD64VPOPCNTB256 + case OpReciprocalSqrtFloat32x8: + v.Op = OpAMD64VRSQRTPS256 return true - case OpPopCountUint8x64: - v.Op = OpAMD64VPOPCNTB512 + case OpReciprocalSqrtFloat64x2: + v.Op = OpAMD64VRSQRT14PD128 return true - case OpPrefetchCache: - v.Op = OpAMD64PrefetchT0 + case OpReciprocalSqrtFloat64x4: + v.Op = OpAMD64VRSQRT14PD256 return true - case OpPrefetchCacheStreamed: - v.Op = OpAMD64PrefetchNTA + case OpReciprocalSqrtFloat64x8: + v.Op = OpAMD64VRSQRT14PD512 return true + case OpReciprocalSqrtMaskedFloat32x16: + return rewriteValueAMD64_OpReciprocalSqrtMaskedFloat32x16(v) + case OpReciprocalSqrtMaskedFloat32x4: + return rewriteValueAMD64_OpReciprocalSqrtMaskedFloat32x4(v) + case OpReciprocalSqrtMaskedFloat32x8: + return rewriteValueAMD64_OpReciprocalSqrtMaskedFloat32x8(v) + case OpReciprocalSqrtMaskedFloat64x2: + return rewriteValueAMD64_OpReciprocalSqrtMaskedFloat64x2(v) + case OpReciprocalSqrtMaskedFloat64x4: + return rewriteValueAMD64_OpReciprocalSqrtMaskedFloat64x4(v) + case OpReciprocalSqrtMaskedFloat64x8: + return rewriteValueAMD64_OpReciprocalSqrtMaskedFloat64x8(v) case OpRotateAllLeftInt32x16: v.Op = OpAMD64VPROLD512 return true @@ -4237,64 +4294,64 @@ func rewriteValueAMD64(v *Value) bool { case OpRound64F: v.Op = OpAMD64LoweredRound64F return true - case OpRoundFloat32x4: - return rewriteValueAMD64_OpRoundFloat32x4(v) - case OpRoundFloat32x8: - return rewriteValueAMD64_OpRoundFloat32x8(v) - case OpRoundFloat64x2: - return rewriteValueAMD64_OpRoundFloat64x2(v) - case OpRoundFloat64x4: - return rewriteValueAMD64_OpRoundFloat64x4(v) - case OpRoundScaledFloat32x16: - return rewriteValueAMD64_OpRoundScaledFloat32x16(v) - case OpRoundScaledFloat32x4: - return rewriteValueAMD64_OpRoundScaledFloat32x4(v) - case OpRoundScaledFloat32x8: - return rewriteValueAMD64_OpRoundScaledFloat32x8(v) - case OpRoundScaledFloat64x2: - return rewriteValueAMD64_OpRoundScaledFloat64x2(v) - case OpRoundScaledFloat64x4: - return rewriteValueAMD64_OpRoundScaledFloat64x4(v) - case OpRoundScaledFloat64x8: - return rewriteValueAMD64_OpRoundScaledFloat64x8(v) - case OpRoundScaledMaskedFloat32x16: - return rewriteValueAMD64_OpRoundScaledMaskedFloat32x16(v) - case OpRoundScaledMaskedFloat32x4: - return rewriteValueAMD64_OpRoundScaledMaskedFloat32x4(v) - case OpRoundScaledMaskedFloat32x8: - return rewriteValueAMD64_OpRoundScaledMaskedFloat32x8(v) - case OpRoundScaledMaskedFloat64x2: - return rewriteValueAMD64_OpRoundScaledMaskedFloat64x2(v) - case OpRoundScaledMaskedFloat64x4: - return rewriteValueAMD64_OpRoundScaledMaskedFloat64x4(v) - case 
OpRoundScaledMaskedFloat64x8: - return rewriteValueAMD64_OpRoundScaledMaskedFloat64x8(v) - case OpRoundScaledResidueFloat32x16: - return rewriteValueAMD64_OpRoundScaledResidueFloat32x16(v) - case OpRoundScaledResidueFloat32x4: - return rewriteValueAMD64_OpRoundScaledResidueFloat32x4(v) - case OpRoundScaledResidueFloat32x8: - return rewriteValueAMD64_OpRoundScaledResidueFloat32x8(v) - case OpRoundScaledResidueFloat64x2: - return rewriteValueAMD64_OpRoundScaledResidueFloat64x2(v) - case OpRoundScaledResidueFloat64x4: - return rewriteValueAMD64_OpRoundScaledResidueFloat64x4(v) - case OpRoundScaledResidueFloat64x8: - return rewriteValueAMD64_OpRoundScaledResidueFloat64x8(v) - case OpRoundScaledResidueMaskedFloat32x16: - return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x16(v) - case OpRoundScaledResidueMaskedFloat32x4: - return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x4(v) - case OpRoundScaledResidueMaskedFloat32x8: - return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x8(v) - case OpRoundScaledResidueMaskedFloat64x2: - return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x2(v) - case OpRoundScaledResidueMaskedFloat64x4: - return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x4(v) - case OpRoundScaledResidueMaskedFloat64x8: - return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x8(v) case OpRoundToEven: return rewriteValueAMD64_OpRoundToEven(v) + case OpRoundToEvenFloat32x4: + return rewriteValueAMD64_OpRoundToEvenFloat32x4(v) + case OpRoundToEvenFloat32x8: + return rewriteValueAMD64_OpRoundToEvenFloat32x8(v) + case OpRoundToEvenFloat64x2: + return rewriteValueAMD64_OpRoundToEvenFloat64x2(v) + case OpRoundToEvenFloat64x4: + return rewriteValueAMD64_OpRoundToEvenFloat64x4(v) + case OpRoundToEvenScaledFloat32x16: + return rewriteValueAMD64_OpRoundToEvenScaledFloat32x16(v) + case OpRoundToEvenScaledFloat32x4: + return rewriteValueAMD64_OpRoundToEvenScaledFloat32x4(v) + case OpRoundToEvenScaledFloat32x8: + return rewriteValueAMD64_OpRoundToEvenScaledFloat32x8(v) + case OpRoundToEvenScaledFloat64x2: + return rewriteValueAMD64_OpRoundToEvenScaledFloat64x2(v) + case OpRoundToEvenScaledFloat64x4: + return rewriteValueAMD64_OpRoundToEvenScaledFloat64x4(v) + case OpRoundToEvenScaledFloat64x8: + return rewriteValueAMD64_OpRoundToEvenScaledFloat64x8(v) + case OpRoundToEvenScaledMaskedFloat32x16: + return rewriteValueAMD64_OpRoundToEvenScaledMaskedFloat32x16(v) + case OpRoundToEvenScaledMaskedFloat32x4: + return rewriteValueAMD64_OpRoundToEvenScaledMaskedFloat32x4(v) + case OpRoundToEvenScaledMaskedFloat32x8: + return rewriteValueAMD64_OpRoundToEvenScaledMaskedFloat32x8(v) + case OpRoundToEvenScaledMaskedFloat64x2: + return rewriteValueAMD64_OpRoundToEvenScaledMaskedFloat64x2(v) + case OpRoundToEvenScaledMaskedFloat64x4: + return rewriteValueAMD64_OpRoundToEvenScaledMaskedFloat64x4(v) + case OpRoundToEvenScaledMaskedFloat64x8: + return rewriteValueAMD64_OpRoundToEvenScaledMaskedFloat64x8(v) + case OpRoundToEvenScaledResidueFloat32x16: + return rewriteValueAMD64_OpRoundToEvenScaledResidueFloat32x16(v) + case OpRoundToEvenScaledResidueFloat32x4: + return rewriteValueAMD64_OpRoundToEvenScaledResidueFloat32x4(v) + case OpRoundToEvenScaledResidueFloat32x8: + return rewriteValueAMD64_OpRoundToEvenScaledResidueFloat32x8(v) + case OpRoundToEvenScaledResidueFloat64x2: + return rewriteValueAMD64_OpRoundToEvenScaledResidueFloat64x2(v) + case OpRoundToEvenScaledResidueFloat64x4: + return rewriteValueAMD64_OpRoundToEvenScaledResidueFloat64x4(v) + case 
OpRoundToEvenScaledResidueFloat64x8: + return rewriteValueAMD64_OpRoundToEvenScaledResidueFloat64x8(v) + case OpRoundToEvenScaledResidueMaskedFloat32x16: + return rewriteValueAMD64_OpRoundToEvenScaledResidueMaskedFloat32x16(v) + case OpRoundToEvenScaledResidueMaskedFloat32x4: + return rewriteValueAMD64_OpRoundToEvenScaledResidueMaskedFloat32x4(v) + case OpRoundToEvenScaledResidueMaskedFloat32x8: + return rewriteValueAMD64_OpRoundToEvenScaledResidueMaskedFloat32x8(v) + case OpRoundToEvenScaledResidueMaskedFloat64x2: + return rewriteValueAMD64_OpRoundToEvenScaledResidueMaskedFloat64x2(v) + case OpRoundToEvenScaledResidueMaskedFloat64x4: + return rewriteValueAMD64_OpRoundToEvenScaledResidueMaskedFloat64x4(v) + case OpRoundToEvenScaledResidueMaskedFloat64x8: + return rewriteValueAMD64_OpRoundToEvenScaledResidueMaskedFloat64x8(v) case OpRsh16Ux16: return rewriteValueAMD64_OpRsh16Ux16(v) case OpRsh16Ux32: @@ -4359,51 +4416,6 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpRsh8x64(v) case OpRsh8x8: return rewriteValueAMD64_OpRsh8x8(v) - case OpSaturatedAddDotProdInt32x16: - v.Op = OpAMD64VPDPWSSDS512 - return true - case OpSaturatedAddDotProdInt32x4: - v.Op = OpAMD64VPDPWSSDS128 - return true - case OpSaturatedAddDotProdInt32x8: - v.Op = OpAMD64VPDPWSSDS256 - return true - case OpSaturatedAddDotProdMaskedInt32x16: - return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x16(v) - case OpSaturatedAddDotProdMaskedInt32x4: - return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x4(v) - case OpSaturatedAddDotProdMaskedInt32x8: - return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x8(v) - case OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16: - return rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16(v) - case OpSaturatedUnsignedSignedPairDotProdMaskedUint8x32: - return rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x32(v) - case OpSaturatedUnsignedSignedPairDotProdMaskedUint8x64: - return rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x64(v) - case OpSaturatedUnsignedSignedPairDotProdUint8x16: - v.Op = OpAMD64VPMADDUBSW128 - return true - case OpSaturatedUnsignedSignedPairDotProdUint8x32: - v.Op = OpAMD64VPMADDUBSW256 - return true - case OpSaturatedUnsignedSignedPairDotProdUint8x64: - v.Op = OpAMD64VPMADDUBSW512 - return true - case OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16: - v.Op = OpAMD64VPDPBUSDS512 - return true - case OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4: - v.Op = OpAMD64VPDPBUSDS128 - return true - case OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8: - v.Op = OpAMD64VPDPBUSDS256 - return true - case OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16: - return rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16(v) - case OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4: - return rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4(v) - case OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8: - return rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8(v) case OpScaleFloat32x16: v.Op = OpAMD64VSCALEFPS512 return true @@ -5246,24 +5258,6 @@ func rewriteValueAMD64(v *Value) bool { case OpSignExt8to64: v.Op = OpAMD64MOVBQSX return true - case OpSignInt16x16: - v.Op = OpAMD64VPSIGNW256 - return true - case OpSignInt16x8: - v.Op = OpAMD64VPSIGNW128 - return true - case OpSignInt32x4: - v.Op = OpAMD64VPSIGND128 - return true - case OpSignInt32x8: - v.Op = 
OpAMD64VPSIGND256 - return true - case OpSignInt8x16: - v.Op = OpAMD64VPSIGNB128 - return true - case OpSignInt8x32: - v.Op = OpAMD64VPSIGNB256 - return true case OpSlicemask: return rewriteValueAMD64_OpSlicemask(v) case OpSpectreIndex: @@ -5563,22 +5557,22 @@ func rewriteValueAMD64(v *Value) bool { case OpSubSaturatedMaskedUint8x64: return rewriteValueAMD64_OpSubSaturatedMaskedUint8x64(v) case OpSubSaturatedUint16x16: - v.Op = OpAMD64VPSUBSW256 + v.Op = OpAMD64VPSUBUSW256 return true case OpSubSaturatedUint16x32: - v.Op = OpAMD64VPSUBSW512 + v.Op = OpAMD64VPSUBUSW512 return true case OpSubSaturatedUint16x8: - v.Op = OpAMD64VPSUBSW128 + v.Op = OpAMD64VPSUBUSW128 return true case OpSubSaturatedUint8x16: - v.Op = OpAMD64VPSUBSB128 + v.Op = OpAMD64VPSUBUSB128 return true case OpSubSaturatedUint8x32: - v.Op = OpAMD64VPSUBSB256 + v.Op = OpAMD64VPSUBUSB256 return true case OpSubSaturatedUint8x64: - v.Op = OpAMD64VPSUBSB512 + v.Op = OpAMD64VPSUBUSB512 return true case OpSubUint16x16: v.Op = OpAMD64VPSUBW256 @@ -5695,21 +5689,6 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x4(v) case OpTruncScaledResidueMaskedFloat64x8: return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x8(v) - case OpUnsignedSignedQuadDotProdAccumulateInt32x16: - v.Op = OpAMD64VPDPBUSD512 - return true - case OpUnsignedSignedQuadDotProdAccumulateInt32x4: - v.Op = OpAMD64VPDPBUSD128 - return true - case OpUnsignedSignedQuadDotProdAccumulateInt32x8: - v.Op = OpAMD64VPDPBUSD256 - return true - case OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x16: - return rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x16(v) - case OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x4: - return rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x4(v) - case OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x8: - return rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x8(v) case OpWB: v.Op = OpAMD64LoweredWB return true @@ -28619,11 +28598,11 @@ func rewriteValueAMD64_OpAMD64XORQmodify(v *Value) bool { } return false } -func rewriteValueAMD64_OpAbsoluteMaskedInt16x16(v *Value) bool { +func rewriteValueAMD64_OpAbsMaskedInt16x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (AbsoluteMaskedInt16x16 x mask) + // match: (AbsMaskedInt16x16 x mask) // result: (VPABSWMasked256 x (VPMOVVec16x16ToM mask)) for { x := v_0 @@ -28635,11 +28614,11 @@ func rewriteValueAMD64_OpAbsoluteMaskedInt16x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpAbsoluteMaskedInt16x32(v *Value) bool { +func rewriteValueAMD64_OpAbsMaskedInt16x32(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (AbsoluteMaskedInt16x32 x mask) + // match: (AbsMaskedInt16x32 x mask) // result: (VPABSWMasked512 x (VPMOVVec16x32ToM mask)) for { x := v_0 @@ -28651,11 +28630,11 @@ func rewriteValueAMD64_OpAbsoluteMaskedInt16x32(v *Value) bool { return true } } -func rewriteValueAMD64_OpAbsoluteMaskedInt16x8(v *Value) bool { +func rewriteValueAMD64_OpAbsMaskedInt16x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (AbsoluteMaskedInt16x8 x mask) + // match: (AbsMaskedInt16x8 x mask) // result: (VPABSWMasked128 x (VPMOVVec16x8ToM mask)) for { x := v_0 @@ -28667,11 +28646,11 @@ func rewriteValueAMD64_OpAbsoluteMaskedInt16x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpAbsoluteMaskedInt32x16(v *Value) bool { +func rewriteValueAMD64_OpAbsMaskedInt32x16(v *Value) bool { v_1 := v.Args[1] v_0 := 
v.Args[0] b := v.Block - // match: (AbsoluteMaskedInt32x16 x mask) + // match: (AbsMaskedInt32x16 x mask) // result: (VPABSDMasked512 x (VPMOVVec32x16ToM mask)) for { x := v_0 @@ -28683,11 +28662,11 @@ func rewriteValueAMD64_OpAbsoluteMaskedInt32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpAbsoluteMaskedInt32x4(v *Value) bool { +func rewriteValueAMD64_OpAbsMaskedInt32x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (AbsoluteMaskedInt32x4 x mask) + // match: (AbsMaskedInt32x4 x mask) // result: (VPABSDMasked128 x (VPMOVVec32x4ToM mask)) for { x := v_0 @@ -28699,11 +28678,11 @@ func rewriteValueAMD64_OpAbsoluteMaskedInt32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpAbsoluteMaskedInt32x8(v *Value) bool { +func rewriteValueAMD64_OpAbsMaskedInt32x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (AbsoluteMaskedInt32x8 x mask) + // match: (AbsMaskedInt32x8 x mask) // result: (VPABSDMasked256 x (VPMOVVec32x8ToM mask)) for { x := v_0 @@ -28715,11 +28694,11 @@ func rewriteValueAMD64_OpAbsoluteMaskedInt32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpAbsoluteMaskedInt64x2(v *Value) bool { +func rewriteValueAMD64_OpAbsMaskedInt64x2(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (AbsoluteMaskedInt64x2 x mask) + // match: (AbsMaskedInt64x2 x mask) // result: (VPABSQMasked128 x (VPMOVVec64x2ToM mask)) for { x := v_0 @@ -28731,11 +28710,11 @@ func rewriteValueAMD64_OpAbsoluteMaskedInt64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpAbsoluteMaskedInt64x4(v *Value) bool { +func rewriteValueAMD64_OpAbsMaskedInt64x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (AbsoluteMaskedInt64x4 x mask) + // match: (AbsMaskedInt64x4 x mask) // result: (VPABSQMasked256 x (VPMOVVec64x4ToM mask)) for { x := v_0 @@ -28747,11 +28726,11 @@ func rewriteValueAMD64_OpAbsoluteMaskedInt64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpAbsoluteMaskedInt64x8(v *Value) bool { +func rewriteValueAMD64_OpAbsMaskedInt64x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (AbsoluteMaskedInt64x8 x mask) + // match: (AbsMaskedInt64x8 x mask) // result: (VPABSQMasked512 x (VPMOVVec64x8ToM mask)) for { x := v_0 @@ -28763,11 +28742,11 @@ func rewriteValueAMD64_OpAbsoluteMaskedInt64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpAbsoluteMaskedInt8x16(v *Value) bool { +func rewriteValueAMD64_OpAbsMaskedInt8x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (AbsoluteMaskedInt8x16 x mask) + // match: (AbsMaskedInt8x16 x mask) // result: (VPABSBMasked128 x (VPMOVVec8x16ToM mask)) for { x := v_0 @@ -28779,11 +28758,11 @@ func rewriteValueAMD64_OpAbsoluteMaskedInt8x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpAbsoluteMaskedInt8x32(v *Value) bool { +func rewriteValueAMD64_OpAbsMaskedInt8x32(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (AbsoluteMaskedInt8x32 x mask) + // match: (AbsMaskedInt8x32 x mask) // result: (VPABSBMasked256 x (VPMOVVec8x32ToM mask)) for { x := v_0 @@ -28795,11 +28774,11 @@ func rewriteValueAMD64_OpAbsoluteMaskedInt8x32(v *Value) bool { return true } } -func rewriteValueAMD64_OpAbsoluteMaskedInt8x64(v *Value) bool { +func rewriteValueAMD64_OpAbsMaskedInt8x64(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (AbsoluteMaskedInt8x64 x mask) + // match: (AbsMaskedInt8x64 x mask) // result: 
(VPABSBMasked512 x (VPMOVVec8x64ToM mask)) for { x := v_0 @@ -28811,60 +28790,180 @@ func rewriteValueAMD64_OpAbsoluteMaskedInt8x64(v *Value) bool { return true } } -func rewriteValueAMD64_OpAddDotProdMaskedInt32x16(v *Value) bool { +func rewriteValueAMD64_OpAddDotProdPairsSaturatedMaskedInt32x16(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddDotProdPairsSaturatedMaskedInt32x16 x y z mask) + // result: (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPWSSDSMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpAddDotProdPairsSaturatedMaskedInt32x4(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddDotProdPairsSaturatedMaskedInt32x4 x y z mask) + // result: (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPWSSDSMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpAddDotProdPairsSaturatedMaskedInt32x8(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddDotProdPairsSaturatedMaskedInt32x8 x y z mask) + // result: (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPWSSDSMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpAddDotProdQuadrupleMaskedInt32x16(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddDotProdQuadrupleMaskedInt32x16 x y z mask) + // result: (VPDPBUSDMasked512 x y z (VPMOVVec32x16ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPBUSDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpAddDotProdQuadrupleMaskedInt32x4(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddDotProdQuadrupleMaskedInt32x4 x y z mask) + // result: (VPDPBUSDMasked128 x y z (VPMOVVec32x4ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPBUSDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpAddDotProdQuadrupleMaskedInt32x8(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddDotProdQuadrupleMaskedInt32x8 x y z mask) + // result: (VPDPBUSDMasked256 x y z (VPMOVVec32x8ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPBUSDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpAddDotProdQuadrupleSaturatedMaskedInt32x16(v *Value) bool { v_3 := v.Args[3] v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (AddDotProdMaskedInt32x16 x y z mask) - // 
result: (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM mask)) + // match: (AddDotProdQuadrupleSaturatedMaskedInt32x16 x y z mask) + // result: (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM mask)) for { x := v_0 y := v_1 z := v_2 mask := v_3 - v.reset(OpAMD64VPDPWSSDMasked512) + v.reset(OpAMD64VPDPBUSDSMasked512) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) v0.AddArg(mask) v.AddArg4(x, y, z, v0) return true } } -func rewriteValueAMD64_OpAddDotProdMaskedInt32x4(v *Value) bool { +func rewriteValueAMD64_OpAddDotProdQuadrupleSaturatedMaskedInt32x4(v *Value) bool { v_3 := v.Args[3] v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (AddDotProdMaskedInt32x4 x y z mask) - // result: (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM mask)) + // match: (AddDotProdQuadrupleSaturatedMaskedInt32x4 x y z mask) + // result: (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM mask)) for { x := v_0 y := v_1 z := v_2 mask := v_3 - v.reset(OpAMD64VPDPWSSDMasked128) + v.reset(OpAMD64VPDPBUSDSMasked128) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) v0.AddArg(mask) v.AddArg4(x, y, z, v0) return true } } -func rewriteValueAMD64_OpAddDotProdMaskedInt32x8(v *Value) bool { +func rewriteValueAMD64_OpAddDotProdQuadrupleSaturatedMaskedInt32x8(v *Value) bool { v_3 := v.Args[3] v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (AddDotProdMaskedInt32x8 x y z mask) - // result: (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM mask)) + // match: (AddDotProdQuadrupleSaturatedMaskedInt32x8 x y z mask) + // result: (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM mask)) for { x := v_0 y := v_1 z := v_2 mask := v_3 - v.reset(OpAMD64VPDPWSSDMasked256) + v.reset(OpAMD64VPDPBUSDSMasked256) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) v0.AddArg(mask) v.AddArg4(x, y, z, v0) @@ -29525,12 +29624,12 @@ func rewriteValueAMD64_OpAddSaturatedMaskedUint16x16(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (AddSaturatedMaskedUint16x16 x y mask) - // result: (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask)) + // result: (VPADDUSWMasked256 x y (VPMOVVec16x16ToM mask)) for { x := v_0 y := v_1 mask := v_2 - v.reset(OpAMD64VPADDSWMasked256) + v.reset(OpAMD64VPADDUSWMasked256) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) v0.AddArg(mask) v.AddArg3(x, y, v0) @@ -29543,12 +29642,12 @@ func rewriteValueAMD64_OpAddSaturatedMaskedUint16x32(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (AddSaturatedMaskedUint16x32 x y mask) - // result: (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask)) + // result: (VPADDUSWMasked512 x y (VPMOVVec16x32ToM mask)) for { x := v_0 y := v_1 mask := v_2 - v.reset(OpAMD64VPADDSWMasked512) + v.reset(OpAMD64VPADDUSWMasked512) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) v0.AddArg(mask) v.AddArg3(x, y, v0) @@ -29561,12 +29660,12 @@ func rewriteValueAMD64_OpAddSaturatedMaskedUint16x8(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (AddSaturatedMaskedUint16x8 x y mask) - // result: (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask)) + // result: (VPADDUSWMasked128 x y (VPMOVVec16x8ToM mask)) for { x := v_0 y := v_1 mask := v_2 - v.reset(OpAMD64VPADDSWMasked128) + v.reset(OpAMD64VPADDUSWMasked128) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) v0.AddArg(mask) v.AddArg3(x, y, v0) @@ -29579,12 +29678,12 @@ func rewriteValueAMD64_OpAddSaturatedMaskedUint8x16(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (AddSaturatedMaskedUint8x16 x y mask) - // result: (VPADDSBMasked128 x y 
(VPMOVVec8x16ToM mask)) + // result: (VPADDUSBMasked128 x y (VPMOVVec8x16ToM mask)) for { x := v_0 y := v_1 mask := v_2 - v.reset(OpAMD64VPADDSBMasked128) + v.reset(OpAMD64VPADDUSBMasked128) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) v0.AddArg(mask) v.AddArg3(x, y, v0) @@ -29597,12 +29696,12 @@ func rewriteValueAMD64_OpAddSaturatedMaskedUint8x32(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (AddSaturatedMaskedUint8x32 x y mask) - // result: (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask)) + // result: (VPADDUSBMasked256 x y (VPMOVVec8x32ToM mask)) for { x := v_0 y := v_1 mask := v_2 - v.reset(OpAMD64VPADDSBMasked256) + v.reset(OpAMD64VPADDUSBMasked256) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) v0.AddArg(mask) v.AddArg3(x, y, v0) @@ -29615,12 +29714,12 @@ func rewriteValueAMD64_OpAddSaturatedMaskedUint8x64(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (AddSaturatedMaskedUint8x64 x y mask) - // result: (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask)) + // result: (VPADDUSBMasked512 x y (VPMOVVec8x64ToM mask)) for { x := v_0 y := v_1 mask := v_2 - v.reset(OpAMD64VPADDSBMasked512) + v.reset(OpAMD64VPADDUSBMasked512) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) v0.AddArg(mask) v.AddArg3(x, y, v0) @@ -30072,198 +30171,6 @@ func rewriteValueAMD64_OpAndNotMaskedUint64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpApproximateReciprocalMaskedFloat32x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (ApproximateReciprocalMaskedFloat32x16 x mask) - // result: (VRCP14PSMasked512 x (VPMOVVec32x16ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VRCP14PSMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpApproximateReciprocalMaskedFloat32x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (ApproximateReciprocalMaskedFloat32x4 x mask) - // result: (VRCP14PSMasked128 x (VPMOVVec32x4ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VRCP14PSMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpApproximateReciprocalMaskedFloat32x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (ApproximateReciprocalMaskedFloat32x8 x mask) - // result: (VRCP14PSMasked256 x (VPMOVVec32x8ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VRCP14PSMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpApproximateReciprocalMaskedFloat64x2(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (ApproximateReciprocalMaskedFloat64x2 x mask) - // result: (VRCP14PDMasked128 x (VPMOVVec64x2ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VRCP14PDMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpApproximateReciprocalMaskedFloat64x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (ApproximateReciprocalMaskedFloat64x4 x mask) - // result: (VRCP14PDMasked256 x (VPMOVVec64x4ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VRCP14PDMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, 
types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpApproximateReciprocalMaskedFloat64x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (ApproximateReciprocalMaskedFloat64x8 x mask) - // result: (VRCP14PDMasked512 x (VPMOVVec64x8ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VRCP14PDMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpApproximateReciprocalOfSqrtMaskedFloat32x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (ApproximateReciprocalOfSqrtMaskedFloat32x16 x mask) - // result: (VRSQRT14PSMasked512 x (VPMOVVec32x16ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VRSQRT14PSMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpApproximateReciprocalOfSqrtMaskedFloat32x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (ApproximateReciprocalOfSqrtMaskedFloat32x4 x mask) - // result: (VRSQRT14PSMasked128 x (VPMOVVec32x4ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VRSQRT14PSMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpApproximateReciprocalOfSqrtMaskedFloat32x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (ApproximateReciprocalOfSqrtMaskedFloat32x8 x mask) - // result: (VRSQRT14PSMasked256 x (VPMOVVec32x8ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VRSQRT14PSMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpApproximateReciprocalOfSqrtMaskedFloat64x2(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (ApproximateReciprocalOfSqrtMaskedFloat64x2 x mask) - // result: (VRSQRT14PDMasked128 x (VPMOVVec64x2ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VRSQRT14PDMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpApproximateReciprocalOfSqrtMaskedFloat64x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (ApproximateReciprocalOfSqrtMaskedFloat64x4 x mask) - // result: (VRSQRT14PDMasked256 x (VPMOVVec64x4ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VRSQRT14PDMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpApproximateReciprocalOfSqrtMaskedFloat64x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (ApproximateReciprocalOfSqrtMaskedFloat64x8 x mask) - // result: (VRSQRT14PDMasked512 x (VPMOVVec64x8ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VRSQRT14PDMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} func rewriteValueAMD64_OpAtomicAdd32(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -33709,45 +33616,111 @@ func rewriteValueAMD64_OpDivMaskedFloat64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpDotProdBroadcastFloat32x4(v *Value) bool { +func 
rewriteValueAMD64_OpDotProdPairsMaskedInt16x16(v *Value) bool { + v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] - // match: (DotProdBroadcastFloat32x4 x y) - // result: (VDPPS128 [127] x y) + b := v.Block + // match: (DotProdPairsMaskedInt16x16 x y mask) + // result: (VPMADDWDMasked256 x y (VPMOVVec16x16ToM mask)) for { x := v_0 y := v_1 - v.reset(OpAMD64VDPPS128) - v.AuxInt = int8ToAuxInt(127) - v.AddArg2(x, y) + mask := v_2 + v.reset(OpAMD64VPMADDWDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) return true } } -func rewriteValueAMD64_OpDotProdBroadcastFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpDotProdPairsMaskedInt16x32(v *Value) bool { + v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] - // match: (DotProdBroadcastFloat32x8 x y) - // result: (VDPPS256 [127] x y) + b := v.Block + // match: (DotProdPairsMaskedInt16x32 x y mask) + // result: (VPMADDWDMasked512 x y (VPMOVVec16x32ToM mask)) for { x := v_0 y := v_1 - v.reset(OpAMD64VDPPS256) - v.AuxInt = int8ToAuxInt(127) - v.AddArg2(x, y) + mask := v_2 + v.reset(OpAMD64VPMADDWDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) return true } } -func rewriteValueAMD64_OpDotProdBroadcastFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpDotProdPairsMaskedInt16x8(v *Value) bool { + v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] - // match: (DotProdBroadcastFloat64x2 x y) - // result: (VDPPD128 [127] x y) + b := v.Block + // match: (DotProdPairsMaskedInt16x8 x y mask) + // result: (VPMADDWDMasked128 x y (VPMOVVec16x8ToM mask)) for { x := v_0 y := v_1 - v.reset(OpAMD64VDPPD128) - v.AuxInt = int8ToAuxInt(127) - v.AddArg2(x, y) + mask := v_2 + v.reset(OpAMD64VPMADDWDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpDotProdPairsSaturatedMaskedUint8x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (DotProdPairsSaturatedMaskedUint8x16 x y mask) + // result: (VPMADDUBSWMasked128 x y (VPMOVVec16x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMADDUBSWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpDotProdPairsSaturatedMaskedUint8x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (DotProdPairsSaturatedMaskedUint8x32 x y mask) + // result: (VPMADDUBSWMasked256 x y (VPMOVVec16x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMADDUBSWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpDotProdPairsSaturatedMaskedUint8x64(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (DotProdPairsSaturatedMaskedUint8x64 x y mask) + // result: (VPMADDUBSWMasked512 x y (VPMOVVec16x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMADDUBSWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) return true } } @@ -35694,366 +35667,6 @@ func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x8(v *Value) bool { return true } } -func 
rewriteValueAMD64_OpFusedMultiplyAddMaskedFloat32x16(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplyAddMaskedFloat32x16 x y z mask) - // result: (VFMADD213PSMasked512 x y z (VPMOVVec32x16ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMADD213PSMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplyAddMaskedFloat32x4(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplyAddMaskedFloat32x4 x y z mask) - // result: (VFMADD213PSMasked128 x y z (VPMOVVec32x4ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMADD213PSMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplyAddMaskedFloat32x8(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplyAddMaskedFloat32x8 x y z mask) - // result: (VFMADD213PSMasked256 x y z (VPMOVVec32x8ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMADD213PSMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplyAddMaskedFloat64x2(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplyAddMaskedFloat64x2 x y z mask) - // result: (VFMADD213PDMasked128 x y z (VPMOVVec64x2ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMADD213PDMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplyAddMaskedFloat64x4(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplyAddMaskedFloat64x4 x y z mask) - // result: (VFMADD213PDMasked256 x y z (VPMOVVec64x4ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMADD213PDMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplyAddMaskedFloat64x8(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplyAddMaskedFloat64x8 x y z mask) - // result: (VFMADD213PDMasked512 x y z (VPMOVVec64x8ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMADD213PDMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplyAddSubMaskedFloat32x16(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplyAddSubMaskedFloat32x16 x y z mask) - // result: (VFMADDSUB213PSMasked512 x y z (VPMOVVec32x16ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMADDSUB213PSMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - 
v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplyAddSubMaskedFloat32x4(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplyAddSubMaskedFloat32x4 x y z mask) - // result: (VFMADDSUB213PSMasked128 x y z (VPMOVVec32x4ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMADDSUB213PSMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplyAddSubMaskedFloat32x8(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplyAddSubMaskedFloat32x8 x y z mask) - // result: (VFMADDSUB213PSMasked256 x y z (VPMOVVec32x8ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMADDSUB213PSMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplyAddSubMaskedFloat64x2(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplyAddSubMaskedFloat64x2 x y z mask) - // result: (VFMADDSUB213PDMasked128 x y z (VPMOVVec64x2ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMADDSUB213PDMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplyAddSubMaskedFloat64x4(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplyAddSubMaskedFloat64x4 x y z mask) - // result: (VFMADDSUB213PDMasked256 x y z (VPMOVVec64x4ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMADDSUB213PDMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplyAddSubMaskedFloat64x8(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplyAddSubMaskedFloat64x8 x y z mask) - // result: (VFMADDSUB213PDMasked512 x y z (VPMOVVec64x8ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMADDSUB213PDMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplySubAddMaskedFloat32x16(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplySubAddMaskedFloat32x16 x y z mask) - // result: (VFMSUBADD213PSMasked512 x y z (VPMOVVec32x16ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMSUBADD213PSMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplySubAddMaskedFloat32x4(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplySubAddMaskedFloat32x4 x y z mask) - // result: (VFMSUBADD213PSMasked128 x y z (VPMOVVec32x4ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 
- mask := v_3 - v.reset(OpAMD64VFMSUBADD213PSMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplySubAddMaskedFloat32x8(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplySubAddMaskedFloat32x8 x y z mask) - // result: (VFMSUBADD213PSMasked256 x y z (VPMOVVec32x8ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMSUBADD213PSMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplySubAddMaskedFloat64x2(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplySubAddMaskedFloat64x2 x y z mask) - // result: (VFMSUBADD213PDMasked128 x y z (VPMOVVec64x2ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMSUBADD213PDMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplySubAddMaskedFloat64x4(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplySubAddMaskedFloat64x4 x y z mask) - // result: (VFMSUBADD213PDMasked256 x y z (VPMOVVec64x4ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMSUBADD213PDMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpFusedMultiplySubAddMaskedFloat64x8(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (FusedMultiplySubAddMaskedFloat64x8 x y z mask) - // result: (VFMSUBADD213PDMasked512 x y z (VPMOVVec64x8ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VFMSUBADD213PDMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} func rewriteValueAMD64_OpGaloisFieldAffineTransformInverseMaskedUint8x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -44852,192 +44465,270 @@ func rewriteValueAMD64_OpMove(v *Value) bool { } return false } -func rewriteValueAMD64_OpMulEvenWidenMaskedInt64x2(v *Value) bool { +func rewriteValueAMD64_OpMulAddMaskedFloat32x16(v *Value) bool { + v_3 := v.Args[3] v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (MulEvenWidenMaskedInt64x2 x y mask) - // result: (VPMULDQMasked128 x y (VPMOVVec64x2ToM mask)) + // match: (MulAddMaskedFloat32x16 x y z mask) + // result: (VFMADD213PSMasked512 x y z (VPMOVVec32x16ToM mask)) for { x := v_0 y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULDQMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMADD213PSMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) v0.AddArg(mask) - v.AddArg3(x, y, v0) + v.AddArg4(x, y, z, v0) return true } } -func rewriteValueAMD64_OpMulEvenWidenMaskedInt64x4(v *Value) bool { +func rewriteValueAMD64_OpMulAddMaskedFloat32x4(v *Value) bool { + v_3 := v.Args[3] v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (MulEvenWidenMaskedInt64x4 x y 
mask) - // result: (VPMULDQMasked256 x y (VPMOVVec64x4ToM mask)) + // match: (MulAddMaskedFloat32x4 x y z mask) + // result: (VFMADD213PSMasked128 x y z (VPMOVVec32x4ToM mask)) for { x := v_0 y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULDQMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMADD213PSMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) v0.AddArg(mask) - v.AddArg3(x, y, v0) + v.AddArg4(x, y, z, v0) return true } } -func rewriteValueAMD64_OpMulEvenWidenMaskedInt64x8(v *Value) bool { +func rewriteValueAMD64_OpMulAddMaskedFloat32x8(v *Value) bool { + v_3 := v.Args[3] v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (MulEvenWidenMaskedInt64x8 x y mask) - // result: (VPMULDQMasked512 x y (VPMOVVec64x8ToM mask)) + // match: (MulAddMaskedFloat32x8 x y z mask) + // result: (VFMADD213PSMasked256 x y z (VPMOVVec32x8ToM mask)) for { x := v_0 y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULDQMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMADD213PSMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) v0.AddArg(mask) - v.AddArg3(x, y, v0) + v.AddArg4(x, y, z, v0) return true } } -func rewriteValueAMD64_OpMulEvenWidenMaskedUint64x2(v *Value) bool { +func rewriteValueAMD64_OpMulAddMaskedFloat64x2(v *Value) bool { + v_3 := v.Args[3] v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (MulEvenWidenMaskedUint64x2 x y mask) - // result: (VPMULUDQMasked128 x y (VPMOVVec64x2ToM mask)) + // match: (MulAddMaskedFloat64x2 x y z mask) + // result: (VFMADD213PDMasked128 x y z (VPMOVVec64x2ToM mask)) for { x := v_0 y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULUDQMasked128) + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMADD213PDMasked128) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) v0.AddArg(mask) - v.AddArg3(x, y, v0) + v.AddArg4(x, y, z, v0) return true } } -func rewriteValueAMD64_OpMulEvenWidenMaskedUint64x4(v *Value) bool { +func rewriteValueAMD64_OpMulAddMaskedFloat64x4(v *Value) bool { + v_3 := v.Args[3] v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (MulEvenWidenMaskedUint64x4 x y mask) - // result: (VPMULUDQMasked256 x y (VPMOVVec64x4ToM mask)) + // match: (MulAddMaskedFloat64x4 x y z mask) + // result: (VFMADD213PDMasked256 x y z (VPMOVVec64x4ToM mask)) for { x := v_0 y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULUDQMasked256) + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMADD213PDMasked256) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) v0.AddArg(mask) - v.AddArg3(x, y, v0) + v.AddArg4(x, y, z, v0) return true } } -func rewriteValueAMD64_OpMulEvenWidenMaskedUint64x8(v *Value) bool { +func rewriteValueAMD64_OpMulAddMaskedFloat64x8(v *Value) bool { + v_3 := v.Args[3] v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (MulEvenWidenMaskedUint64x8 x y mask) - // result: (VPMULUDQMasked512 x y (VPMOVVec64x8ToM mask)) + // match: (MulAddMaskedFloat64x8 x y z mask) + // result: (VFMADD213PDMasked512 x y z (VPMOVVec64x8ToM mask)) for { x := v_0 y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULUDQMasked512) + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMADD213PDMasked512) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) v0.AddArg(mask) - v.AddArg3(x, y, v0) + v.AddArg4(x, y, z, v0) return true } } -func rewriteValueAMD64_OpMulHighMaskedInt16x16(v *Value) bool { +func 
rewriteValueAMD64_OpMulAddSubMaskedFloat32x16(v *Value) bool { + v_3 := v.Args[3] v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (MulHighMaskedInt16x16 x y mask) - // result: (VPMULHWMasked256 x y (VPMOVVec16x16ToM mask)) + // match: (MulAddSubMaskedFloat32x16 x y z mask) + // result: (VFMADDSUB213PSMasked512 x y z (VPMOVVec32x16ToM mask)) for { x := v_0 y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULHWMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMADDSUB213PSMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) v0.AddArg(mask) - v.AddArg3(x, y, v0) + v.AddArg4(x, y, z, v0) return true } } -func rewriteValueAMD64_OpMulHighMaskedInt16x32(v *Value) bool { +func rewriteValueAMD64_OpMulAddSubMaskedFloat32x4(v *Value) bool { + v_3 := v.Args[3] v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (MulHighMaskedInt16x32 x y mask) - // result: (VPMULHWMasked512 x y (VPMOVVec16x32ToM mask)) + // match: (MulAddSubMaskedFloat32x4 x y z mask) + // result: (VFMADDSUB213PSMasked128 x y z (VPMOVVec32x4ToM mask)) for { x := v_0 y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULHWMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMADDSUB213PSMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) v0.AddArg(mask) - v.AddArg3(x, y, v0) + v.AddArg4(x, y, z, v0) return true } } -func rewriteValueAMD64_OpMulHighMaskedInt16x8(v *Value) bool { +func rewriteValueAMD64_OpMulAddSubMaskedFloat32x8(v *Value) bool { + v_3 := v.Args[3] v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (MulHighMaskedInt16x8 x y mask) - // result: (VPMULHWMasked128 x y (VPMOVVec16x8ToM mask)) + // match: (MulAddSubMaskedFloat32x8 x y z mask) + // result: (VFMADDSUB213PSMasked256 x y z (VPMOVVec32x8ToM mask)) for { x := v_0 y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULHWMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMADDSUB213PSMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) v0.AddArg(mask) - v.AddArg3(x, y, v0) + v.AddArg4(x, y, z, v0) return true } } -func rewriteValueAMD64_OpMulHighMaskedUint16x16(v *Value) bool { +func rewriteValueAMD64_OpMulAddSubMaskedFloat64x2(v *Value) bool { + v_3 := v.Args[3] v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (MulHighMaskedUint16x16 x y mask) - // result: (VPMULHUWMasked256 x y (VPMOVVec16x16ToM mask)) + // match: (MulAddSubMaskedFloat64x2 x y z mask) + // result: (VFMADDSUB213PDMasked128 x y z (VPMOVVec64x2ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMADDSUB213PDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMulAddSubMaskedFloat64x4(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulAddSubMaskedFloat64x4 x y z mask) + // result: (VFMADDSUB213PDMasked256 x y z (VPMOVVec64x4ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMADDSUB213PDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMulAddSubMaskedFloat64x8(v *Value) bool { + 
v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulAddSubMaskedFloat64x8 x y z mask) + // result: (VFMADDSUB213PDMasked512 x y z (VPMOVVec64x8ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMADDSUB213PDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMulHighMaskedInt16x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulHighMaskedInt16x16 x y mask) + // result: (VPMULHWMasked256 x y (VPMOVVec16x16ToM mask)) for { x := v_0 y := v_1 mask := v_2 - v.reset(OpAMD64VPMULHUWMasked256) + v.reset(OpAMD64VPMULHWMasked256) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) v0.AddArg(mask) v.AddArg3(x, y, v0) return true } } -func rewriteValueAMD64_OpMulHighMaskedUint16x32(v *Value) bool { +func rewriteValueAMD64_OpMulHighMaskedInt16x32(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (MulHighMaskedUint16x32 x y mask) + // match: (MulHighMaskedInt16x32 x y mask) // result: (VPMULHUWMasked512 x y (VPMOVVec16x32ToM mask)) for { x := v_0 @@ -45050,12 +44741,12 @@ func rewriteValueAMD64_OpMulHighMaskedUint16x32(v *Value) bool { return true } } -func rewriteValueAMD64_OpMulHighMaskedUint16x8(v *Value) bool { +func rewriteValueAMD64_OpMulHighMaskedInt16x8(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (MulHighMaskedUint16x8 x y mask) + // match: (MulHighMaskedInt16x8 x y mask) // result: (VPMULHUWMasked128 x y (VPMOVVec16x8ToM mask)) for { x := v_0 @@ -45338,6 +45029,288 @@ func rewriteValueAMD64_OpMulMaskedInt64x8(v *Value) bool { return true } } +func rewriteValueAMD64_OpMulMaskedUint16x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedUint16x16 x y mask) + // result: (VPMULLWMasked256 x y (VPMOVVec16x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedUint16x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedUint16x32 x y mask) + // result: (VPMULLWMasked512 x y (VPMOVVec16x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedUint16x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedUint16x8 x y mask) + // result: (VPMULLWMasked128 x y (VPMOVVec16x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedUint32x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedUint32x16 x y mask) + // result: (VPMULLDMasked512 x y (VPMOVVec32x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) 
+ v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedUint32x4(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedUint32x4 x y mask) + // result: (VPMULLDMasked128 x y (VPMOVVec32x4ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedUint32x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedUint32x8 x y mask) + // result: (VPMULLDMasked256 x y (VPMOVVec32x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedUint64x2(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedUint64x2 x y mask) + // result: (VPMULLQMasked128 x y (VPMOVVec64x2ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLQMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedUint64x4(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedUint64x4 x y mask) + // result: (VPMULLQMasked256 x y (VPMOVVec64x4ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLQMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedUint64x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedUint64x8 x y mask) + // result: (VPMULLQMasked512 x y (VPMOVVec64x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLQMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulSubAddMaskedFloat32x16(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulSubAddMaskedFloat32x16 x y z mask) + // result: (VFMSUBADD213PSMasked512 x y z (VPMOVVec32x16ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMSUBADD213PSMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMulSubAddMaskedFloat32x4(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulSubAddMaskedFloat32x4 x y z mask) + // result: (VFMSUBADD213PSMasked128 x y z (VPMOVVec32x4ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMSUBADD213PSMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMulSubAddMaskedFloat32x8(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulSubAddMaskedFloat32x8 x y z mask) + // result: (VFMSUBADD213PSMasked256 x y z (VPMOVVec32x8ToM mask)) + for { + x := v_0 + 
y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMSUBADD213PSMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMulSubAddMaskedFloat64x2(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulSubAddMaskedFloat64x2 x y z mask) + // result: (VFMSUBADD213PDMasked128 x y z (VPMOVVec64x2ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMSUBADD213PDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMulSubAddMaskedFloat64x4(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulSubAddMaskedFloat64x4 x y z mask) + // result: (VFMSUBADD213PDMasked256 x y z (VPMOVVec64x4ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMSUBADD213PDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpMulSubAddMaskedFloat64x8(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulSubAddMaskedFloat64x8 x y z mask) + // result: (VFMSUBADD213PDMasked512 x y z (VPMOVVec64x8ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VFMSUBADD213PDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} func rewriteValueAMD64_OpNeg32F(v *Value) bool { v_0 := v.Args[0] b := v.Block @@ -46722,6 +46695,390 @@ func rewriteValueAMD64_OpOffPtr(v *Value) bool { return true } } +func rewriteValueAMD64_OpOnesCountMaskedInt16x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedInt16x16 x mask) + // result: (VPOPCNTWMasked256 x (VPMOVVec16x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedInt16x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedInt16x32 x mask) + // result: (VPOPCNTWMasked512 x (VPMOVVec16x32ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedInt16x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedInt16x8 x mask) + // result: (VPOPCNTWMasked128 x (VPMOVVec16x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedInt32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedInt32x16 x mask) + // result: (VPOPCNTDMasked512 x (VPMOVVec32x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return 
true + } +} +func rewriteValueAMD64_OpOnesCountMaskedInt32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedInt32x4 x mask) + // result: (VPOPCNTDMasked128 x (VPMOVVec32x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedInt32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedInt32x8 x mask) + // result: (VPOPCNTDMasked256 x (VPMOVVec32x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedInt64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedInt64x2 x mask) + // result: (VPOPCNTQMasked128 x (VPMOVVec64x2ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTQMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedInt64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedInt64x4 x mask) + // result: (VPOPCNTQMasked256 x (VPMOVVec64x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTQMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedInt64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedInt64x8 x mask) + // result: (VPOPCNTQMasked512 x (VPMOVVec64x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTQMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedInt8x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedInt8x16 x mask) + // result: (VPOPCNTBMasked128 x (VPMOVVec8x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTBMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedInt8x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedInt8x32 x mask) + // result: (VPOPCNTBMasked256 x (VPMOVVec8x32ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTBMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedInt8x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedInt8x64 x mask) + // result: (VPOPCNTBMasked512 x (VPMOVVec8x64ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTBMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedUint16x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedUint16x16 x mask) + // result: (VPOPCNTWMasked256 x (VPMOVVec16x16ToM 
mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedUint16x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedUint16x32 x mask) + // result: (VPOPCNTWMasked512 x (VPMOVVec16x32ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedUint16x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedUint16x8 x mask) + // result: (VPOPCNTWMasked128 x (VPMOVVec16x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedUint32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedUint32x16 x mask) + // result: (VPOPCNTDMasked512 x (VPMOVVec32x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedUint32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedUint32x4 x mask) + // result: (VPOPCNTDMasked128 x (VPMOVVec32x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedUint32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedUint32x8 x mask) + // result: (VPOPCNTDMasked256 x (VPMOVVec32x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedUint64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedUint64x2 x mask) + // result: (VPOPCNTQMasked128 x (VPMOVVec64x2ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTQMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedUint64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedUint64x4 x mask) + // result: (VPOPCNTQMasked256 x (VPMOVVec64x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTQMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedUint64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedUint64x8 x mask) + // result: (VPOPCNTQMasked512 x (VPMOVVec64x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTQMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func 
rewriteValueAMD64_OpOnesCountMaskedUint8x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedUint8x16 x mask) + // result: (VPOPCNTBMasked128 x (VPMOVVec8x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTBMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedUint8x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedUint8x32 x mask) + // result: (VPOPCNTBMasked256 x (VPMOVVec8x32ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTBMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpOnesCountMaskedUint8x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (OnesCountMaskedUint8x64 x mask) + // result: (VPOPCNTBMasked512 x (VPMOVVec8x64ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VPOPCNTBMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} func rewriteValueAMD64_OpOrMaskedInt32x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -46938,60 +47295,6 @@ func rewriteValueAMD64_OpOrMaskedUint64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpPairDotProdMaskedInt16x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PairDotProdMaskedInt16x16 x y mask) - // result: (VPMADDWDMasked256 x y (VPMOVVec16x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMADDWDMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpPairDotProdMaskedInt16x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PairDotProdMaskedInt16x32 x y mask) - // result: (VPMADDWDMasked512 x y (VPMOVVec16x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMADDWDMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpPairDotProdMaskedInt16x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PairDotProdMaskedInt16x8 x y mask) - // result: (VPMADDWDMasked128 x y (VPMOVVec16x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMADDWDMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} func rewriteValueAMD64_OpPermute2MaskedFloat32x16(v *Value) bool { v_3 := v.Args[3] v_2 := v.Args[2] @@ -48054,390 +48357,198 @@ func rewriteValueAMD64_OpPopCount8(v *Value) bool { return true } } -func rewriteValueAMD64_OpPopCountMaskedInt16x16(v *Value) bool { +func rewriteValueAMD64_OpReciprocalMaskedFloat32x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (PopCountMaskedInt16x16 x mask) - // result: (VPOPCNTWMasked256 x (VPMOVVec16x16ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VPOPCNTWMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpPopCountMaskedInt16x32(v *Value) 
bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PopCountMaskedInt16x32 x mask) - // result: (VPOPCNTWMasked512 x (VPMOVVec16x32ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VPOPCNTWMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpPopCountMaskedInt16x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PopCountMaskedInt16x8 x mask) - // result: (VPOPCNTWMasked128 x (VPMOVVec16x8ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VPOPCNTWMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpPopCountMaskedInt32x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PopCountMaskedInt32x16 x mask) - // result: (VPOPCNTDMasked512 x (VPMOVVec32x16ToM mask)) + // match: (ReciprocalMaskedFloat32x16 x mask) + // result: (VRCP14PSMasked512 x (VPMOVVec32x16ToM mask)) for { x := v_0 mask := v_1 - v.reset(OpAMD64VPOPCNTDMasked512) + v.reset(OpAMD64VRCP14PSMasked512) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) v0.AddArg(mask) v.AddArg2(x, v0) return true } } -func rewriteValueAMD64_OpPopCountMaskedInt32x4(v *Value) bool { +func rewriteValueAMD64_OpReciprocalMaskedFloat32x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (PopCountMaskedInt32x4 x mask) - // result: (VPOPCNTDMasked128 x (VPMOVVec32x4ToM mask)) + // match: (ReciprocalMaskedFloat32x4 x mask) + // result: (VRCP14PSMasked128 x (VPMOVVec32x4ToM mask)) for { x := v_0 mask := v_1 - v.reset(OpAMD64VPOPCNTDMasked128) + v.reset(OpAMD64VRCP14PSMasked128) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) v0.AddArg(mask) v.AddArg2(x, v0) return true } } -func rewriteValueAMD64_OpPopCountMaskedInt32x8(v *Value) bool { +func rewriteValueAMD64_OpReciprocalMaskedFloat32x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (PopCountMaskedInt32x8 x mask) - // result: (VPOPCNTDMasked256 x (VPMOVVec32x8ToM mask)) + // match: (ReciprocalMaskedFloat32x8 x mask) + // result: (VRCP14PSMasked256 x (VPMOVVec32x8ToM mask)) for { x := v_0 mask := v_1 - v.reset(OpAMD64VPOPCNTDMasked256) + v.reset(OpAMD64VRCP14PSMasked256) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) v0.AddArg(mask) v.AddArg2(x, v0) return true } } -func rewriteValueAMD64_OpPopCountMaskedInt64x2(v *Value) bool { +func rewriteValueAMD64_OpReciprocalMaskedFloat64x2(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (PopCountMaskedInt64x2 x mask) - // result: (VPOPCNTQMasked128 x (VPMOVVec64x2ToM mask)) + // match: (ReciprocalMaskedFloat64x2 x mask) + // result: (VRCP14PDMasked128 x (VPMOVVec64x2ToM mask)) for { x := v_0 mask := v_1 - v.reset(OpAMD64VPOPCNTQMasked128) + v.reset(OpAMD64VRCP14PDMasked128) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) v0.AddArg(mask) v.AddArg2(x, v0) return true } } -func rewriteValueAMD64_OpPopCountMaskedInt64x4(v *Value) bool { +func rewriteValueAMD64_OpReciprocalMaskedFloat64x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (PopCountMaskedInt64x4 x mask) - // result: (VPOPCNTQMasked256 x (VPMOVVec64x4ToM mask)) + // match: (ReciprocalMaskedFloat64x4 x mask) + // result: (VRCP14PDMasked256 x (VPMOVVec64x4ToM mask)) for { x := v_0 mask := v_1 - 
v.reset(OpAMD64VPOPCNTQMasked256) + v.reset(OpAMD64VRCP14PDMasked256) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) v0.AddArg(mask) v.AddArg2(x, v0) return true } } -func rewriteValueAMD64_OpPopCountMaskedInt64x8(v *Value) bool { +func rewriteValueAMD64_OpReciprocalMaskedFloat64x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (PopCountMaskedInt64x8 x mask) - // result: (VPOPCNTQMasked512 x (VPMOVVec64x8ToM mask)) + // match: (ReciprocalMaskedFloat64x8 x mask) + // result: (VRCP14PDMasked512 x (VPMOVVec64x8ToM mask)) for { x := v_0 mask := v_1 - v.reset(OpAMD64VPOPCNTQMasked512) + v.reset(OpAMD64VRCP14PDMasked512) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) v0.AddArg(mask) v.AddArg2(x, v0) return true } } -func rewriteValueAMD64_OpPopCountMaskedInt8x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PopCountMaskedInt8x16 x mask) - // result: (VPOPCNTBMasked128 x (VPMOVVec8x16ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VPOPCNTBMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpPopCountMaskedInt8x32(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PopCountMaskedInt8x32 x mask) - // result: (VPOPCNTBMasked256 x (VPMOVVec8x32ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VPOPCNTBMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpPopCountMaskedInt8x64(v *Value) bool { +func rewriteValueAMD64_OpReciprocalSqrtMaskedFloat32x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (PopCountMaskedInt8x64 x mask) - // result: (VPOPCNTBMasked512 x (VPMOVVec8x64ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VPOPCNTBMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpPopCountMaskedUint16x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PopCountMaskedUint16x16 x mask) - // result: (VPOPCNTWMasked256 x (VPMOVVec16x16ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VPOPCNTWMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpPopCountMaskedUint16x32(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PopCountMaskedUint16x32 x mask) - // result: (VPOPCNTWMasked512 x (VPMOVVec16x32ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VPOPCNTWMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpPopCountMaskedUint16x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PopCountMaskedUint16x8 x mask) - // result: (VPOPCNTWMasked128 x (VPMOVVec16x8ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VPOPCNTWMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpPopCountMaskedUint32x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PopCountMaskedUint32x16 x mask) - // result: (VPOPCNTDMasked512 x 
(VPMOVVec32x16ToM mask)) + // match: (ReciprocalSqrtMaskedFloat32x16 x mask) + // result: (VRSQRT14PSMasked512 x (VPMOVVec32x16ToM mask)) for { x := v_0 mask := v_1 - v.reset(OpAMD64VPOPCNTDMasked512) + v.reset(OpAMD64VRSQRT14PSMasked512) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) v0.AddArg(mask) v.AddArg2(x, v0) return true } } -func rewriteValueAMD64_OpPopCountMaskedUint32x4(v *Value) bool { +func rewriteValueAMD64_OpReciprocalSqrtMaskedFloat32x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (PopCountMaskedUint32x4 x mask) - // result: (VPOPCNTDMasked128 x (VPMOVVec32x4ToM mask)) + // match: (ReciprocalSqrtMaskedFloat32x4 x mask) + // result: (VRSQRT14PSMasked128 x (VPMOVVec32x4ToM mask)) for { x := v_0 mask := v_1 - v.reset(OpAMD64VPOPCNTDMasked128) + v.reset(OpAMD64VRSQRT14PSMasked128) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) v0.AddArg(mask) v.AddArg2(x, v0) return true } } -func rewriteValueAMD64_OpPopCountMaskedUint32x8(v *Value) bool { +func rewriteValueAMD64_OpReciprocalSqrtMaskedFloat32x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (PopCountMaskedUint32x8 x mask) - // result: (VPOPCNTDMasked256 x (VPMOVVec32x8ToM mask)) + // match: (ReciprocalSqrtMaskedFloat32x8 x mask) + // result: (VRSQRT14PSMasked256 x (VPMOVVec32x8ToM mask)) for { x := v_0 mask := v_1 - v.reset(OpAMD64VPOPCNTDMasked256) + v.reset(OpAMD64VRSQRT14PSMasked256) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) v0.AddArg(mask) v.AddArg2(x, v0) return true } } -func rewriteValueAMD64_OpPopCountMaskedUint64x2(v *Value) bool { +func rewriteValueAMD64_OpReciprocalSqrtMaskedFloat64x2(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (PopCountMaskedUint64x2 x mask) - // result: (VPOPCNTQMasked128 x (VPMOVVec64x2ToM mask)) + // match: (ReciprocalSqrtMaskedFloat64x2 x mask) + // result: (VRSQRT14PDMasked128 x (VPMOVVec64x2ToM mask)) for { x := v_0 mask := v_1 - v.reset(OpAMD64VPOPCNTQMasked128) + v.reset(OpAMD64VRSQRT14PDMasked128) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) v0.AddArg(mask) v.AddArg2(x, v0) return true } } -func rewriteValueAMD64_OpPopCountMaskedUint64x4(v *Value) bool { +func rewriteValueAMD64_OpReciprocalSqrtMaskedFloat64x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (PopCountMaskedUint64x4 x mask) - // result: (VPOPCNTQMasked256 x (VPMOVVec64x4ToM mask)) + // match: (ReciprocalSqrtMaskedFloat64x4 x mask) + // result: (VRSQRT14PDMasked256 x (VPMOVVec64x4ToM mask)) for { x := v_0 mask := v_1 - v.reset(OpAMD64VPOPCNTQMasked256) + v.reset(OpAMD64VRSQRT14PDMasked256) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) v0.AddArg(mask) v.AddArg2(x, v0) return true } } -func rewriteValueAMD64_OpPopCountMaskedUint64x8(v *Value) bool { +func rewriteValueAMD64_OpReciprocalSqrtMaskedFloat64x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (PopCountMaskedUint64x8 x mask) - // result: (VPOPCNTQMasked512 x (VPMOVVec64x8ToM mask)) + // match: (ReciprocalSqrtMaskedFloat64x8 x mask) + // result: (VRSQRT14PDMasked512 x (VPMOVVec64x8ToM mask)) for { x := v_0 mask := v_1 - v.reset(OpAMD64VPOPCNTQMasked512) + v.reset(OpAMD64VRSQRT14PDMasked512) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) v0.AddArg(mask) v.AddArg2(x, v0) return true } } -func rewriteValueAMD64_OpPopCountMaskedUint8x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: 
(PopCountMaskedUint8x16 x mask) - // result: (VPOPCNTBMasked128 x (VPMOVVec8x16ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VPOPCNTBMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpPopCountMaskedUint8x32(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PopCountMaskedUint8x32 x mask) - // result: (VPOPCNTBMasked256 x (VPMOVVec8x32ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VPOPCNTBMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpPopCountMaskedUint8x64(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PopCountMaskedUint8x64 x mask) - // result: (VPOPCNTBMasked512 x (VPMOVVec8x64ToM mask)) - for { - x := v_0 - mask := v_1 - v.reset(OpAMD64VPOPCNTBMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} func rewriteValueAMD64_OpRotateAllLeftMaskedInt32x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -49302,9 +49413,21 @@ func rewriteValueAMD64_OpRotateRightMaskedUint64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpRoundToEven(v *Value) bool { + v_0 := v.Args[0] + // match: (RoundToEven x) + // result: (ROUNDSD [0] x) + for { + x := v_0 + v.reset(OpAMD64ROUNDSD) + v.AuxInt = int8ToAuxInt(0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpRoundToEvenFloat32x4(v *Value) bool { v_0 := v.Args[0] - // match: (RoundFloat32x4 x) + // match: (RoundToEvenFloat32x4 x) // result: (VROUNDPS128 [0] x) for { x := v_0 @@ -49314,9 +49437,9 @@ func rewriteValueAMD64_OpRoundFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenFloat32x8(v *Value) bool { v_0 := v.Args[0] - // match: (RoundFloat32x8 x) + // match: (RoundToEvenFloat32x8 x) // result: (VROUNDPS256 [0] x) for { x := v_0 @@ -49326,9 +49449,9 @@ func rewriteValueAMD64_OpRoundFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenFloat64x2(v *Value) bool { v_0 := v.Args[0] - // match: (RoundFloat64x2 x) + // match: (RoundToEvenFloat64x2 x) // result: (VROUNDPD128 [0] x) for { x := v_0 @@ -49338,9 +49461,9 @@ func rewriteValueAMD64_OpRoundFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenFloat64x4(v *Value) bool { v_0 := v.Args[0] - // match: (RoundFloat64x4 x) + // match: (RoundToEvenFloat64x4 x) // result: (VROUNDPD256 [0] x) for { x := v_0 @@ -49350,9 +49473,9 @@ func rewriteValueAMD64_OpRoundFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledFloat32x16(v *Value) bool { v_0 := v.Args[0] - // match: (RoundScaledFloat32x16 [a] x) + // match: (RoundToEvenScaledFloat32x16 [a] x) // result: (VRNDSCALEPS512 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -49363,9 +49486,9 @@ func rewriteValueAMD64_OpRoundScaledFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledFloat32x4(v *Value) bool { v_0 := v.Args[0] - // match: 
(RoundScaledFloat32x4 [a] x) + // match: (RoundToEvenScaledFloat32x4 [a] x) // result: (VRNDSCALEPS128 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -49376,9 +49499,9 @@ func rewriteValueAMD64_OpRoundScaledFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledFloat32x8(v *Value) bool { v_0 := v.Args[0] - // match: (RoundScaledFloat32x8 [a] x) + // match: (RoundToEvenScaledFloat32x8 [a] x) // result: (VRNDSCALEPS256 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -49389,9 +49512,9 @@ func rewriteValueAMD64_OpRoundScaledFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledFloat64x2(v *Value) bool { v_0 := v.Args[0] - // match: (RoundScaledFloat64x2 [a] x) + // match: (RoundToEvenScaledFloat64x2 [a] x) // result: (VRNDSCALEPD128 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -49402,9 +49525,9 @@ func rewriteValueAMD64_OpRoundScaledFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledFloat64x4(v *Value) bool { v_0 := v.Args[0] - // match: (RoundScaledFloat64x4 [a] x) + // match: (RoundToEvenScaledFloat64x4 [a] x) // result: (VRNDSCALEPD256 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -49415,9 +49538,9 @@ func rewriteValueAMD64_OpRoundScaledFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledFloat64x8(v *Value) bool { v_0 := v.Args[0] - // match: (RoundScaledFloat64x8 [a] x) + // match: (RoundToEvenScaledFloat64x8 [a] x) // result: (VRNDSCALEPD512 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -49428,11 +49551,11 @@ func rewriteValueAMD64_OpRoundScaledFloat64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledMaskedFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledMaskedFloat32x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundScaledMaskedFloat32x16 [a] x mask) + // match: (RoundToEvenScaledMaskedFloat32x16 [a] x mask) // result: (VRNDSCALEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -49446,11 +49569,11 @@ func rewriteValueAMD64_OpRoundScaledMaskedFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledMaskedFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledMaskedFloat32x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundScaledMaskedFloat32x4 [a] x mask) + // match: (RoundToEvenScaledMaskedFloat32x4 [a] x mask) // result: (VRNDSCALEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -49464,11 +49587,11 @@ func rewriteValueAMD64_OpRoundScaledMaskedFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledMaskedFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledMaskedFloat32x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundScaledMaskedFloat32x8 [a] x mask) + // match: (RoundToEvenScaledMaskedFloat32x8 [a] x mask) // result: (VRNDSCALEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -49482,11 +49605,11 @@ func rewriteValueAMD64_OpRoundScaledMaskedFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledMaskedFloat64x2(v *Value) bool { +func 
rewriteValueAMD64_OpRoundToEvenScaledMaskedFloat64x2(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundScaledMaskedFloat64x2 [a] x mask) + // match: (RoundToEvenScaledMaskedFloat64x2 [a] x mask) // result: (VRNDSCALEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -49500,11 +49623,11 @@ func rewriteValueAMD64_OpRoundScaledMaskedFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledMaskedFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledMaskedFloat64x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundScaledMaskedFloat64x4 [a] x mask) + // match: (RoundToEvenScaledMaskedFloat64x4 [a] x mask) // result: (VRNDSCALEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -49518,11 +49641,11 @@ func rewriteValueAMD64_OpRoundScaledMaskedFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledMaskedFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledMaskedFloat64x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundScaledMaskedFloat64x8 [a] x mask) + // match: (RoundToEvenScaledMaskedFloat64x8 [a] x mask) // result: (VRNDSCALEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -49536,9 +49659,9 @@ func rewriteValueAMD64_OpRoundScaledMaskedFloat64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledResidueFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledResidueFloat32x16(v *Value) bool { v_0 := v.Args[0] - // match: (RoundScaledResidueFloat32x16 [a] x) + // match: (RoundToEvenScaledResidueFloat32x16 [a] x) // result: (VREDUCEPS512 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -49549,9 +49672,9 @@ func rewriteValueAMD64_OpRoundScaledResidueFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledResidueFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledResidueFloat32x4(v *Value) bool { v_0 := v.Args[0] - // match: (RoundScaledResidueFloat32x4 [a] x) + // match: (RoundToEvenScaledResidueFloat32x4 [a] x) // result: (VREDUCEPS128 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -49562,9 +49685,9 @@ func rewriteValueAMD64_OpRoundScaledResidueFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledResidueFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledResidueFloat32x8(v *Value) bool { v_0 := v.Args[0] - // match: (RoundScaledResidueFloat32x8 [a] x) + // match: (RoundToEvenScaledResidueFloat32x8 [a] x) // result: (VREDUCEPS256 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -49575,9 +49698,9 @@ func rewriteValueAMD64_OpRoundScaledResidueFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledResidueFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledResidueFloat64x2(v *Value) bool { v_0 := v.Args[0] - // match: (RoundScaledResidueFloat64x2 [a] x) + // match: (RoundToEvenScaledResidueFloat64x2 [a] x) // result: (VREDUCEPD128 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -49588,9 +49711,9 @@ func rewriteValueAMD64_OpRoundScaledResidueFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledResidueFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledResidueFloat64x4(v *Value) bool { v_0 := v.Args[0] - // match: (RoundScaledResidueFloat64x4 [a] x) + // match: (RoundToEvenScaledResidueFloat64x4 [a] x) // result: (VREDUCEPD256 [a+0] x) for { a 
:= auxIntToInt8(v.AuxInt) @@ -49601,9 +49724,9 @@ func rewriteValueAMD64_OpRoundScaledResidueFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledResidueFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledResidueFloat64x8(v *Value) bool { v_0 := v.Args[0] - // match: (RoundScaledResidueFloat64x8 [a] x) + // match: (RoundToEvenScaledResidueFloat64x8 [a] x) // result: (VREDUCEPD512 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -49614,11 +49737,11 @@ func rewriteValueAMD64_OpRoundScaledResidueFloat64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledResidueMaskedFloat32x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundScaledResidueMaskedFloat32x16 [a] x mask) + // match: (RoundToEvenScaledResidueMaskedFloat32x16 [a] x mask) // result: (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -49632,11 +49755,11 @@ func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledResidueMaskedFloat32x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundScaledResidueMaskedFloat32x4 [a] x mask) + // match: (RoundToEvenScaledResidueMaskedFloat32x4 [a] x mask) // result: (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -49650,11 +49773,11 @@ func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledResidueMaskedFloat32x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundScaledResidueMaskedFloat32x8 [a] x mask) + // match: (RoundToEvenScaledResidueMaskedFloat32x8 [a] x mask) // result: (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -49668,11 +49791,11 @@ func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledResidueMaskedFloat64x2(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundScaledResidueMaskedFloat64x2 [a] x mask) + // match: (RoundToEvenScaledResidueMaskedFloat64x2 [a] x mask) // result: (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -49686,11 +49809,11 @@ func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledResidueMaskedFloat64x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundScaledResidueMaskedFloat64x4 [a] x mask) + // match: (RoundToEvenScaledResidueMaskedFloat64x4 [a] x mask) // result: (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -49704,11 +49827,11 @@ func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpRoundToEvenScaledResidueMaskedFloat64x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: 
(RoundScaledResidueMaskedFloat64x8 [a] x mask) + // match: (RoundToEvenScaledResidueMaskedFloat64x8 [a] x mask) // result: (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -49722,18 +49845,6 @@ func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundToEven(v *Value) bool { - v_0 := v.Args[0] - // match: (RoundToEven x) - // result: (ROUNDSD [0] x) - for { - x := v_0 - v.reset(OpAMD64ROUNDSD) - v.AuxInt = int8ToAuxInt(0) - v.AddArg(x) - return true - } -} func rewriteValueAMD64_OpRsh16Ux16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -51062,180 +51173,6 @@ func rewriteValueAMD64_OpRsh8x8(v *Value) bool { } return false } -func rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x16(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddDotProdMaskedInt32x16 x y z mask) - // result: (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPWSSDSMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x4(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddDotProdMaskedInt32x4 x y z mask) - // result: (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPWSSDSMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x8(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddDotProdMaskedInt32x8 x y z mask) - // result: (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPWSSDSMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedUnsignedSignedPairDotProdMaskedUint8x16 x y mask) - // result: (VPMADDUBSWMasked128 x y (VPMOVVec16x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMADDUBSWMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedUnsignedSignedPairDotProdMaskedUint8x32 x y mask) - // result: (VPMADDUBSWMasked256 x y (VPMOVVec16x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMADDUBSWMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x64(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedUnsignedSignedPairDotProdMaskedUint8x64 x y mask) - // result: 
(VPMADDUBSWMasked512 x y (VPMOVVec16x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMADDUBSWMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16 x y z mask) - // result: (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPBUSDSMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4 x y z mask) - // result: (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPBUSDSMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8 x y z mask) - // result: (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPBUSDSMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} func rewriteValueAMD64_OpScaleMaskedFloat32x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -57918,12 +57855,12 @@ func rewriteValueAMD64_OpSubSaturatedMaskedUint16x16(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (SubSaturatedMaskedUint16x16 x y mask) - // result: (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask)) + // result: (VPSUBUSWMasked256 x y (VPMOVVec16x16ToM mask)) for { x := v_0 y := v_1 mask := v_2 - v.reset(OpAMD64VPSUBSWMasked256) + v.reset(OpAMD64VPSUBUSWMasked256) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) v0.AddArg(mask) v.AddArg3(x, y, v0) @@ -57936,12 +57873,12 @@ func rewriteValueAMD64_OpSubSaturatedMaskedUint16x32(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (SubSaturatedMaskedUint16x32 x y mask) - // result: (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask)) + // result: (VPSUBUSWMasked512 x y (VPMOVVec16x32ToM mask)) for { x := v_0 y := v_1 mask := v_2 - v.reset(OpAMD64VPSUBSWMasked512) + v.reset(OpAMD64VPSUBUSWMasked512) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) v0.AddArg(mask) v.AddArg3(x, y, v0) @@ -57954,12 +57891,12 @@ func rewriteValueAMD64_OpSubSaturatedMaskedUint16x8(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (SubSaturatedMaskedUint16x8 x y mask) - // result: (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask)) + // result: (VPSUBUSWMasked128 x y (VPMOVVec16x8ToM mask)) for { x := v_0 y := v_1 mask := v_2 - v.reset(OpAMD64VPSUBSWMasked128) + v.reset(OpAMD64VPSUBUSWMasked128) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) v0.AddArg(mask) v.AddArg3(x, y, 
v0) @@ -57972,12 +57909,12 @@ func rewriteValueAMD64_OpSubSaturatedMaskedUint8x16(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (SubSaturatedMaskedUint8x16 x y mask) - // result: (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask)) + // result: (VPSUBUSBMasked128 x y (VPMOVVec8x16ToM mask)) for { x := v_0 y := v_1 mask := v_2 - v.reset(OpAMD64VPSUBSBMasked128) + v.reset(OpAMD64VPSUBUSBMasked128) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) v0.AddArg(mask) v.AddArg3(x, y, v0) @@ -57990,12 +57927,12 @@ func rewriteValueAMD64_OpSubSaturatedMaskedUint8x32(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (SubSaturatedMaskedUint8x32 x y mask) - // result: (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask)) + // result: (VPSUBUSBMasked256 x y (VPMOVVec8x32ToM mask)) for { x := v_0 y := v_1 mask := v_2 - v.reset(OpAMD64VPSUBSBMasked256) + v.reset(OpAMD64VPSUBUSBMasked256) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) v0.AddArg(mask) v.AddArg3(x, y, v0) @@ -58008,12 +57945,12 @@ func rewriteValueAMD64_OpSubSaturatedMaskedUint8x64(v *Value) bool { v_0 := v.Args[0] b := v.Block // match: (SubSaturatedMaskedUint8x64 x y mask) - // result: (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask)) + // result: (VPSUBUSBMasked512 x y (VPMOVVec8x64ToM mask)) for { x := v_0 y := v_1 mask := v_2 - v.reset(OpAMD64VPSUBSBMasked512) + v.reset(OpAMD64VPSUBUSBMasked512) v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) v0.AddArg(mask) v.AddArg3(x, y, v0) @@ -58452,66 +58389,6 @@ func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x16(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (UnsignedSignedQuadDotProdAccumulateMaskedInt32x16 x y z mask) - // result: (VPDPBUSDMasked512 x y z (VPMOVVec32x16ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPBUSDMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x4(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (UnsignedSignedQuadDotProdAccumulateMaskedInt32x4 x y z mask) - // result: (VPDPBUSDMasked128 x y z (VPMOVVec32x4ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPBUSDMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x8(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (UnsignedSignedQuadDotProdAccumulateMaskedInt32x8 x y z mask) - // result: (VPDPBUSDMasked256 x y z (VPMOVVec32x8ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPBUSDMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} func rewriteValueAMD64_OpXorMaskedInt32x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index c7f97e03a0..4be74d9136 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ 
b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -11,30 +11,30 @@ import ( const simdPackage = "simd" func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily)) { - addF(simdPackage, "Int8x16.Absolute", opLen1(ssa.OpAbsoluteInt8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.Absolute", opLen1(ssa.OpAbsoluteInt8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.Absolute", opLen1(ssa.OpAbsoluteInt8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.Absolute", opLen1(ssa.OpAbsoluteInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.Absolute", opLen1(ssa.OpAbsoluteInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.Absolute", opLen1(ssa.OpAbsoluteInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.Absolute", opLen1(ssa.OpAbsoluteInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.Absolute", opLen1(ssa.OpAbsoluteInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.Absolute", opLen1(ssa.OpAbsoluteInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int64x2.Absolute", opLen1(ssa.OpAbsoluteInt64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x4.Absolute", opLen1(ssa.OpAbsoluteInt64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int64x8.Absolute", opLen1(ssa.OpAbsoluteInt64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.AbsoluteMasked", opLen2(ssa.OpAbsoluteMaskedInt8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.AbsoluteMasked", opLen2(ssa.OpAbsoluteMaskedInt8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.AbsoluteMasked", opLen2(ssa.OpAbsoluteMaskedInt8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.AbsoluteMasked", opLen2(ssa.OpAbsoluteMaskedInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.AbsoluteMasked", opLen2(ssa.OpAbsoluteMaskedInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.AbsoluteMasked", opLen2(ssa.OpAbsoluteMaskedInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.AbsoluteMasked", opLen2(ssa.OpAbsoluteMaskedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.AbsoluteMasked", opLen2(ssa.OpAbsoluteMaskedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.AbsoluteMasked", opLen2(ssa.OpAbsoluteMaskedInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int64x2.AbsoluteMasked", opLen2(ssa.OpAbsoluteMaskedInt64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x4.AbsoluteMasked", opLen2(ssa.OpAbsoluteMaskedInt64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int64x8.AbsoluteMasked", opLen2(ssa.OpAbsoluteMaskedInt64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.Abs", opLen1(ssa.OpAbsInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.Abs", opLen1(ssa.OpAbsInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.Abs", opLen1(ssa.OpAbsInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.Abs", opLen1(ssa.OpAbsInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.Abs", opLen1(ssa.OpAbsInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.Abs", opLen1(ssa.OpAbsInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.Abs", opLen1(ssa.OpAbsInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.Abs", opLen1(ssa.OpAbsInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.Abs", opLen1(ssa.OpAbsInt32x16, 
types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x2.Abs", opLen1(ssa.OpAbsInt64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x4.Abs", opLen1(ssa.OpAbsInt64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x8.Abs", opLen1(ssa.OpAbsInt64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.AbsMasked", opLen2(ssa.OpAbsMaskedInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.AbsMasked", opLen2(ssa.OpAbsMaskedInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.AbsMasked", opLen2(ssa.OpAbsMaskedInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.AbsMasked", opLen2(ssa.OpAbsMaskedInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.AbsMasked", opLen2(ssa.OpAbsMaskedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.AbsMasked", opLen2(ssa.OpAbsMaskedInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.AbsMasked", opLen2(ssa.OpAbsMaskedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.AbsMasked", opLen2(ssa.OpAbsMaskedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.AbsMasked", opLen2(ssa.OpAbsMaskedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x2.AbsMasked", opLen2(ssa.OpAbsMaskedInt64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x4.AbsMasked", opLen2(ssa.OpAbsMaskedInt64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x8.AbsMasked", opLen2(ssa.OpAbsMaskedInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.Add", opLen2(ssa.OpAddFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Add", opLen2(ssa.OpAddFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.Add", opLen2(ssa.OpAddFloat32x16, types.TypeVec512), sys.AMD64) @@ -65,12 +65,24 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint64x2.Add", opLen2(ssa.OpAddUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.Add", opLen2(ssa.OpAddUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Add", opLen2(ssa.OpAddUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.AddDotProd", opLen3(ssa.OpAddDotProdInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.AddDotProd", opLen3(ssa.OpAddDotProdInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.AddDotProd", opLen3(ssa.OpAddDotProdInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.AddDotProdMasked", opLen4(ssa.OpAddDotProdMaskedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.AddDotProdMasked", opLen4(ssa.OpAddDotProdMaskedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.AddDotProdMasked", opLen4(ssa.OpAddDotProdMaskedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.AddDotProdPairsSaturated", opLen3(ssa.OpAddDotProdPairsSaturatedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.AddDotProdPairsSaturated", opLen3(ssa.OpAddDotProdPairsSaturatedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.AddDotProdPairsSaturated", opLen3(ssa.OpAddDotProdPairsSaturatedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.AddDotProdPairsSaturatedMasked", opLen4(ssa.OpAddDotProdPairsSaturatedMaskedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.AddDotProdPairsSaturatedMasked", opLen4(ssa.OpAddDotProdPairsSaturatedMaskedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.AddDotProdPairsSaturatedMasked", opLen4(ssa.OpAddDotProdPairsSaturatedMaskedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.AddDotProdQuadruple", opLen3_31(ssa.OpAddDotProdQuadrupleInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.AddDotProdQuadruple", opLen3_31(ssa.OpAddDotProdQuadrupleInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.AddDotProdQuadruple", opLen3_31(ssa.OpAddDotProdQuadrupleInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.AddDotProdQuadrupleMasked", opLen4_31(ssa.OpAddDotProdQuadrupleMaskedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.AddDotProdQuadrupleMasked", opLen4_31(ssa.OpAddDotProdQuadrupleMaskedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.AddDotProdQuadrupleMasked", opLen4_31(ssa.OpAddDotProdQuadrupleMaskedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.AddDotProdQuadrupleSaturated", opLen3_31(ssa.OpAddDotProdQuadrupleSaturatedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.AddDotProdQuadrupleSaturated", opLen3_31(ssa.OpAddDotProdQuadrupleSaturatedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.AddDotProdQuadrupleSaturated", opLen3_31(ssa.OpAddDotProdQuadrupleSaturatedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.AddDotProdQuadrupleSaturatedMasked", opLen4_31(ssa.OpAddDotProdQuadrupleSaturatedMaskedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.AddDotProdQuadrupleSaturatedMasked", opLen4_31(ssa.OpAddDotProdQuadrupleSaturatedMaskedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.AddDotProdQuadrupleSaturatedMasked", opLen4_31(ssa.OpAddDotProdQuadrupleSaturatedMaskedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.AddMasked", opLen3(ssa.OpAddMaskedFloat32x4, types.TypeVec128), sys.AMD64) 
addF(simdPackage, "Float32x8.AddMasked", opLen3(ssa.OpAddMaskedFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.AddMasked", opLen3(ssa.OpAddMaskedFloat32x16, types.TypeVec512), sys.AMD64) @@ -215,30 +227,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint64x2.AndNotMasked", opLen3_21(ssa.OpAndNotMaskedUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.AndNotMasked", opLen3_21(ssa.OpAndNotMaskedUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.AndNotMasked", opLen3_21(ssa.OpAndNotMaskedUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.ApproximateReciprocal", opLen1(ssa.OpApproximateReciprocalFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.ApproximateReciprocal", opLen1(ssa.OpApproximateReciprocalFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.ApproximateReciprocal", opLen1(ssa.OpApproximateReciprocalFloat32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.ApproximateReciprocal", opLen1(ssa.OpApproximateReciprocalFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.ApproximateReciprocal", opLen1(ssa.OpApproximateReciprocalFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.ApproximateReciprocal", opLen1(ssa.OpApproximateReciprocalFloat64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.ApproximateReciprocalMasked", opLen2(ssa.OpApproximateReciprocalMaskedFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.ApproximateReciprocalMasked", opLen2(ssa.OpApproximateReciprocalMaskedFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.ApproximateReciprocalMasked", opLen2(ssa.OpApproximateReciprocalMaskedFloat32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.ApproximateReciprocalMasked", opLen2(ssa.OpApproximateReciprocalMaskedFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.ApproximateReciprocalMasked", opLen2(ssa.OpApproximateReciprocalMaskedFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.ApproximateReciprocalMasked", opLen2(ssa.OpApproximateReciprocalMaskedFloat64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.ApproximateReciprocalOfSqrt", opLen1(ssa.OpApproximateReciprocalOfSqrtFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.ApproximateReciprocalOfSqrt", opLen1(ssa.OpApproximateReciprocalOfSqrtFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.ApproximateReciprocalOfSqrt", opLen1(ssa.OpApproximateReciprocalOfSqrtFloat32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.ApproximateReciprocalOfSqrt", opLen1(ssa.OpApproximateReciprocalOfSqrtFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.ApproximateReciprocalOfSqrt", opLen1(ssa.OpApproximateReciprocalOfSqrtFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.ApproximateReciprocalOfSqrt", opLen1(ssa.OpApproximateReciprocalOfSqrtFloat64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.ApproximateReciprocalOfSqrtMasked", opLen2(ssa.OpApproximateReciprocalOfSqrtMaskedFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.ApproximateReciprocalOfSqrtMasked", opLen2(ssa.OpApproximateReciprocalOfSqrtMaskedFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.ApproximateReciprocalOfSqrtMasked", 
opLen2(ssa.OpApproximateReciprocalOfSqrtMaskedFloat32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.ApproximateReciprocalOfSqrtMasked", opLen2(ssa.OpApproximateReciprocalOfSqrtMaskedFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.ApproximateReciprocalOfSqrtMasked", opLen2(ssa.OpApproximateReciprocalOfSqrtMaskedFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.ApproximateReciprocalOfSqrtMasked", opLen2(ssa.OpApproximateReciprocalOfSqrtMaskedFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x16.Average", opLen2(ssa.OpAverageUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x32.Average", opLen2(ssa.OpAverageUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x64.Average", opLen2(ssa.OpAverageUint8x64, types.TypeVec512), sys.AMD64) @@ -321,6 +309,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float32x4.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.CopySign", opLen2(ssa.OpCopySignInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.CopySign", opLen2(ssa.OpCopySignInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x8.CopySign", opLen2(ssa.OpCopySignInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.CopySign", opLen2(ssa.OpCopySignInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x4.CopySign", opLen2(ssa.OpCopySignInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.CopySign", opLen2(ssa.OpCopySignInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x4.Div", opLen2(ssa.OpDivFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Div", opLen2(ssa.OpDivFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.Div", opLen2(ssa.OpDivFloat32x16, types.TypeVec512), sys.AMD64) @@ -333,9 +327,18 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Float64x2.DivMasked", opLen3(ssa.OpDivMaskedFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.DivMasked", opLen3(ssa.OpDivMaskedFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x8.DivMasked", opLen3(ssa.OpDivMaskedFloat64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.DotProdBroadcast", opLen2(ssa.OpDotProdBroadcastFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.DotProdBroadcast", opLen2(ssa.OpDotProdBroadcastFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x2.DotProdBroadcast", opLen2(ssa.OpDotProdBroadcastFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x8.DotProdPairs", opLen2(ssa.OpDotProdPairsInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.DotProdPairs", opLen2(ssa.OpDotProdPairsInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.DotProdPairs", opLen2(ssa.OpDotProdPairsInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.DotProdPairsMasked", opLen3(ssa.OpDotProdPairsMaskedInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.DotProdPairsMasked", opLen3(ssa.OpDotProdPairsMaskedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.DotProdPairsMasked", opLen3(ssa.OpDotProdPairsMaskedInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.DotProdPairsSaturated", opLen2(ssa.OpDotProdPairsSaturatedUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x32.DotProdPairsSaturated", opLen2(ssa.OpDotProdPairsSaturatedUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.DotProdPairsSaturated", opLen2(ssa.OpDotProdPairsSaturatedUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.DotProdPairsSaturatedMasked", opLen3(ssa.OpDotProdPairsSaturatedMaskedUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x32.DotProdPairsSaturatedMasked", opLen3(ssa.OpDotProdPairsSaturatedMaskedUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.DotProdPairsSaturatedMasked", opLen3(ssa.OpDotProdPairsSaturatedMaskedUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x16.Equal", opLen2(ssa.OpEqualInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Equal", opLen2(ssa.OpEqualInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.Equal", opLen2(ssa.OpEqualInt8x64, types.TypeVec512), sys.AMD64) @@ -454,42 +457,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Float64x2.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) addF(simdPackage, "Float64x4.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) addF(simdPackage, "Float64x8.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.FusedMultiplyAddMasked", opLen4(ssa.OpFusedMultiplyAddMaskedFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.FusedMultiplyAddMasked", opLen4(ssa.OpFusedMultiplyAddMaskedFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.FusedMultiplyAddMasked", opLen4(ssa.OpFusedMultiplyAddMaskedFloat32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.FusedMultiplyAddMasked", opLen4(ssa.OpFusedMultiplyAddMaskedFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.FusedMultiplyAddMasked", opLen4(ssa.OpFusedMultiplyAddMaskedFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.FusedMultiplyAddMasked", opLen4(ssa.OpFusedMultiplyAddMaskedFloat64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.FusedMultiplyAddSub", opLen3(ssa.OpFusedMultiplyAddSubFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.FusedMultiplyAddSub", opLen3(ssa.OpFusedMultiplyAddSubFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.FusedMultiplyAddSub", opLen3(ssa.OpFusedMultiplyAddSubFloat32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.FusedMultiplyAddSub", opLen3(ssa.OpFusedMultiplyAddSubFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.FusedMultiplyAddSub", opLen3(ssa.OpFusedMultiplyAddSubFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.FusedMultiplyAddSub", opLen3(ssa.OpFusedMultiplyAddSubFloat64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.FusedMultiplyAddSubMasked", opLen4(ssa.OpFusedMultiplyAddSubMaskedFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.FusedMultiplyAddSubMasked", opLen4(ssa.OpFusedMultiplyAddSubMaskedFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.FusedMultiplyAddSubMasked", opLen4(ssa.OpFusedMultiplyAddSubMaskedFloat32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.FusedMultiplyAddSubMasked", opLen4(ssa.OpFusedMultiplyAddSubMaskedFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.FusedMultiplyAddSubMasked", opLen4(ssa.OpFusedMultiplyAddSubMaskedFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.FusedMultiplyAddSubMasked", opLen4(ssa.OpFusedMultiplyAddSubMaskedFloat64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, 
"Float32x4.FusedMultiplySubAdd", opLen3(ssa.OpFusedMultiplySubAddFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.FusedMultiplySubAdd", opLen3(ssa.OpFusedMultiplySubAddFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.FusedMultiplySubAdd", opLen3(ssa.OpFusedMultiplySubAddFloat32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.FusedMultiplySubAdd", opLen3(ssa.OpFusedMultiplySubAddFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.FusedMultiplySubAdd", opLen3(ssa.OpFusedMultiplySubAddFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.FusedMultiplySubAdd", opLen3(ssa.OpFusedMultiplySubAddFloat64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.FusedMultiplySubAddMasked", opLen4(ssa.OpFusedMultiplySubAddMaskedFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.FusedMultiplySubAddMasked", opLen4(ssa.OpFusedMultiplySubAddMaskedFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.FusedMultiplySubAddMasked", opLen4(ssa.OpFusedMultiplySubAddMaskedFloat32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.FusedMultiplySubAddMasked", opLen4(ssa.OpFusedMultiplySubAddMaskedFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.FusedMultiplySubAddMasked", opLen4(ssa.OpFusedMultiplySubAddMaskedFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.FusedMultiplySubAddMasked", opLen4(ssa.OpFusedMultiplySubAddMaskedFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x16.GaloisFieldAffineTransform", opLen2Imm8_2I(ssa.OpGaloisFieldAffineTransformUint8x16, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Uint8x32.GaloisFieldAffineTransform", opLen2Imm8_2I(ssa.OpGaloisFieldAffineTransformUint8x32, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Uint8x64.GaloisFieldAffineTransform", opLen2Imm8_2I(ssa.OpGaloisFieldAffineTransformUint8x64, types.TypeVec512, 0), sys.AMD64) @@ -943,34 +910,49 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Int64x2.Mul", opLen2(ssa.OpMulInt64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x4.Mul", opLen2(ssa.OpMulInt64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int64x8.Mul", opLen2(ssa.OpMulInt64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.Mul", opLen2(ssa.OpMulUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.Mul", opLen2(ssa.OpMulUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.Mul", opLen2(ssa.OpMulUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint32x4.Mul", opLen2(ssa.OpMulUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x8.Mul", opLen2(ssa.OpMulUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x16.Mul", opLen2(ssa.OpMulUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x2.Mul", opLen2(ssa.OpMulUint64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x4.Mul", opLen2(ssa.OpMulUint64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint64x8.Mul", opLen2(ssa.OpMulUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.MulAdd", opLen3(ssa.OpMulAddFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.MulAdd", opLen3(ssa.OpMulAddFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.MulAdd", opLen3(ssa.OpMulAddFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.MulAdd", opLen3(ssa.OpMulAddFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.MulAdd", opLen3(ssa.OpMulAddFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.MulAdd", opLen3(ssa.OpMulAddFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.MulAddMasked", opLen4(ssa.OpMulAddMaskedFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.MulAddMasked", opLen4(ssa.OpMulAddMaskedFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.MulAddMasked", opLen4(ssa.OpMulAddMaskedFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.MulAddMasked", opLen4(ssa.OpMulAddMaskedFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.MulAddMasked", opLen4(ssa.OpMulAddMaskedFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.MulAddMasked", opLen4(ssa.OpMulAddMaskedFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.MulAddSub", opLen3(ssa.OpMulAddSubFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.MulAddSub", opLen3(ssa.OpMulAddSubFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.MulAddSub", opLen3(ssa.OpMulAddSubFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.MulAddSub", opLen3(ssa.OpMulAddSubFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.MulAddSub", opLen3(ssa.OpMulAddSubFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.MulAddSub", opLen3(ssa.OpMulAddSubFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.MulAddSubMasked", opLen4(ssa.OpMulAddSubMaskedFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.MulAddSubMasked", opLen4(ssa.OpMulAddSubMaskedFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.MulAddSubMasked", opLen4(ssa.OpMulAddSubMaskedFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.MulAddSubMasked", opLen4(ssa.OpMulAddSubMaskedFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.MulAddSubMasked", 
opLen4(ssa.OpMulAddSubMaskedFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.MulAddSubMasked", opLen4(ssa.OpMulAddSubMaskedFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x4.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x8.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int64x2.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x4.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int64x8.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint32x4.MulEvenWiden", opLen2(ssa.OpMulEvenWidenUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x8.MulEvenWiden", opLen2(ssa.OpMulEvenWidenUint32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint64x2.MulEvenWiden", opLen2(ssa.OpMulEvenWidenUint64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint64x4.MulEvenWiden", opLen2(ssa.OpMulEvenWidenUint64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint64x8.MulEvenWiden", opLen2(ssa.OpMulEvenWidenUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int64x2.MulEvenWidenMasked", opLen3(ssa.OpMulEvenWidenMaskedInt64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x4.MulEvenWidenMasked", opLen3(ssa.OpMulEvenWidenMaskedInt64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int64x8.MulEvenWidenMasked", opLen3(ssa.OpMulEvenWidenMaskedInt64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint64x2.MulEvenWidenMasked", opLen3(ssa.OpMulEvenWidenMaskedUint64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint64x4.MulEvenWidenMasked", opLen3(ssa.OpMulEvenWidenMaskedUint64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint64x8.MulEvenWidenMasked", opLen3(ssa.OpMulEvenWidenMaskedUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int16x8.MulHigh", opLen2(ssa.OpMulHighInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.MulHigh", opLen2(ssa.OpMulHighInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x32.MulHigh", opLen2(ssa.OpMulHighInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x8.MulHigh", opLen2(ssa.OpMulHighUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.MulHigh", opLen2(ssa.OpMulHighUint16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x32.MulHigh", opLen2(ssa.OpMulHighUint16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int16x8.MulHighMasked", opLen3(ssa.OpMulHighMaskedInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.MulHighMasked", opLen3(ssa.OpMulHighMaskedInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x32.MulHighMasked", opLen3(ssa.OpMulHighMaskedInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x8.MulHighMasked", opLen3(ssa.OpMulHighMaskedUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.MulHighMasked", opLen3(ssa.OpMulHighMaskedUint16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x32.MulHighMasked", opLen3(ssa.OpMulHighMaskedUint16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.MulMasked", opLen3(ssa.OpMulMaskedFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.MulMasked", opLen3(ssa.OpMulMaskedFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.MulMasked", opLen3(ssa.OpMulMaskedFloat32x16, 
types.TypeVec512), sys.AMD64) @@ -986,6 +968,27 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Int64x2.MulMasked", opLen3(ssa.OpMulMaskedInt64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x4.MulMasked", opLen3(ssa.OpMulMaskedInt64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int64x8.MulMasked", opLen3(ssa.OpMulMaskedInt64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.MulMasked", opLen3(ssa.OpMulMaskedUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.MulMasked", opLen3(ssa.OpMulMaskedUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.MulMasked", opLen3(ssa.OpMulMaskedUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint32x4.MulMasked", opLen3(ssa.OpMulMaskedUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x8.MulMasked", opLen3(ssa.OpMulMaskedUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x16.MulMasked", opLen3(ssa.OpMulMaskedUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x2.MulMasked", opLen3(ssa.OpMulMaskedUint64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x4.MulMasked", opLen3(ssa.OpMulMaskedUint64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint64x8.MulMasked", opLen3(ssa.OpMulMaskedUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.MulSubAdd", opLen3(ssa.OpMulSubAddFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.MulSubAdd", opLen3(ssa.OpMulSubAddFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.MulSubAdd", opLen3(ssa.OpMulSubAddFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.MulSubAdd", opLen3(ssa.OpMulSubAddFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.MulSubAdd", opLen3(ssa.OpMulSubAddFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.MulSubAdd", opLen3(ssa.OpMulSubAddFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.MulSubAddMasked", opLen4(ssa.OpMulSubAddMaskedFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.MulSubAddMasked", opLen4(ssa.OpMulSubAddMaskedFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.MulSubAddMasked", opLen4(ssa.OpMulSubAddMaskedFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.MulSubAddMasked", opLen4(ssa.OpMulSubAddMaskedFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.MulSubAddMasked", opLen4(ssa.OpMulSubAddMaskedFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.MulSubAddMasked", opLen4(ssa.OpMulSubAddMaskedFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.NotEqual", opLen2(ssa.OpNotEqualFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.NotEqual", opLen2(ssa.OpNotEqualFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.NotEqual", opLen2(ssa.OpNotEqualFloat32x16, types.TypeVec512), sys.AMD64) @@ -1046,6 +1049,54 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint64x2.NotEqualMasked", opLen3(ssa.OpNotEqualMaskedUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.NotEqualMasked", opLen3(ssa.OpNotEqualMaskedUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.NotEqualMasked", opLen3(ssa.OpNotEqualMaskedUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.OnesCount", opLen1(ssa.OpOnesCountInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.OnesCount", opLen1(ssa.OpOnesCountInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.OnesCount", opLen1(ssa.OpOnesCountInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.OnesCount", opLen1(ssa.OpOnesCountInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.OnesCount", opLen1(ssa.OpOnesCountInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.OnesCount", opLen1(ssa.OpOnesCountInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.OnesCount", opLen1(ssa.OpOnesCountInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.OnesCount", opLen1(ssa.OpOnesCountInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.OnesCount", opLen1(ssa.OpOnesCountInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x2.OnesCount", opLen1(ssa.OpOnesCountInt64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x4.OnesCount", opLen1(ssa.OpOnesCountInt64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x8.OnesCount", opLen1(ssa.OpOnesCountInt64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.OnesCount", opLen1(ssa.OpOnesCountUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x32.OnesCount", opLen1(ssa.OpOnesCountUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.OnesCount", opLen1(ssa.OpOnesCountUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.OnesCount", opLen1(ssa.OpOnesCountUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.OnesCount", opLen1(ssa.OpOnesCountUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.OnesCount", opLen1(ssa.OpOnesCountUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint32x4.OnesCount", opLen1(ssa.OpOnesCountUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x8.OnesCount", opLen1(ssa.OpOnesCountUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x16.OnesCount", opLen1(ssa.OpOnesCountUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x2.OnesCount", opLen1(ssa.OpOnesCountUint64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x4.OnesCount", opLen1(ssa.OpOnesCountUint64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint64x8.OnesCount", opLen1(ssa.OpOnesCountUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.OnesCountMasked", 
opLen2(ssa.OpOnesCountMaskedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x2.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedInt64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x4.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedInt64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x8.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedInt64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x32.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint32x4.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x8.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x16.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x2.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedUint64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x4.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedUint64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint64x8.OnesCountMasked", opLen2(ssa.OpOnesCountMaskedUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x16.Or", opLen2(ssa.OpOrInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Or", opLen2(ssa.OpOrInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.Or", opLen2(ssa.OpOrInt8x64, types.TypeVec512), sys.AMD64) @@ -1082,12 +1133,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
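The block above is the intrinsics side of the PopCount to OnesCount rename (the method declarations in ops_amd64.go follow later in the patch). A rough sketch of the renamed call, assuming OnesCount keeps PopCount's element-wise, shape-preserving signature and that the Masked variant takes only the mask, as the other unary *Masked ops do; same hypothetical sketch file as above:

// perLanePopcount counts the set bits in every byte lane,
// formerly written x.PopCount().
func perLanePopcount(x simd.Uint8x16) simd.Uint8x16 {
	return x.OnesCount()
}

// maskedPopcount does the same, but only for lanes selected by m.
func maskedPopcount(x simd.Uint8x16, m simd.Mask8x16) simd.Uint8x16 {
	return x.OnesCountMasked(m)
}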
addF(simdPackage, "Uint64x2.OrMasked", opLen3(ssa.OpOrMaskedUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.OrMasked", opLen3(ssa.OpOrMaskedUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.OrMasked", opLen3(ssa.OpOrMaskedUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.PairDotProd", opLen2(ssa.OpPairDotProdInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.PairDotProd", opLen2(ssa.OpPairDotProdInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.PairDotProd", opLen2(ssa.OpPairDotProdInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64) @@ -1196,54 +1241,30 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float64x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int64x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint64x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.PopCount", opLen1(ssa.OpPopCountInt8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.PopCount", opLen1(ssa.OpPopCountInt8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.PopCount", opLen1(ssa.OpPopCountInt8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.PopCount", opLen1(ssa.OpPopCountInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.PopCount", opLen1(ssa.OpPopCountInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.PopCount", opLen1(ssa.OpPopCountInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.PopCount", opLen1(ssa.OpPopCountInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.PopCount", opLen1(ssa.OpPopCountInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.PopCount", opLen1(ssa.OpPopCountInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int64x2.PopCount", opLen1(ssa.OpPopCountInt64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x4.PopCount", opLen1(ssa.OpPopCountInt64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int64x8.PopCount", opLen1(ssa.OpPopCountInt64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x16.PopCount", opLen1(ssa.OpPopCountUint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x32.PopCount", opLen1(ssa.OpPopCountUint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x64.PopCount", opLen1(ssa.OpPopCountUint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x8.PopCount", opLen1(ssa.OpPopCountUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.PopCount", opLen1(ssa.OpPopCountUint16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x32.PopCount", opLen1(ssa.OpPopCountUint16x32, types.TypeVec512), sys.AMD64) - 
addF(simdPackage, "Uint32x4.PopCount", opLen1(ssa.OpPopCountUint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x8.PopCount", opLen1(ssa.OpPopCountUint32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint32x16.PopCount", opLen1(ssa.OpPopCountUint32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint64x2.PopCount", opLen1(ssa.OpPopCountUint64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint64x4.PopCount", opLen1(ssa.OpPopCountUint64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint64x8.PopCount", opLen1(ssa.OpPopCountUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.PopCountMasked", opLen2(ssa.OpPopCountMaskedInt8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.PopCountMasked", opLen2(ssa.OpPopCountMaskedInt8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.PopCountMasked", opLen2(ssa.OpPopCountMaskedInt8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.PopCountMasked", opLen2(ssa.OpPopCountMaskedInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.PopCountMasked", opLen2(ssa.OpPopCountMaskedInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.PopCountMasked", opLen2(ssa.OpPopCountMaskedInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.PopCountMasked", opLen2(ssa.OpPopCountMaskedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.PopCountMasked", opLen2(ssa.OpPopCountMaskedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.PopCountMasked", opLen2(ssa.OpPopCountMaskedInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int64x2.PopCountMasked", opLen2(ssa.OpPopCountMaskedInt64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x4.PopCountMasked", opLen2(ssa.OpPopCountMaskedInt64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int64x8.PopCountMasked", opLen2(ssa.OpPopCountMaskedInt64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x16.PopCountMasked", opLen2(ssa.OpPopCountMaskedUint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x32.PopCountMasked", opLen2(ssa.OpPopCountMaskedUint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x64.PopCountMasked", opLen2(ssa.OpPopCountMaskedUint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x8.PopCountMasked", opLen2(ssa.OpPopCountMaskedUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.PopCountMasked", opLen2(ssa.OpPopCountMaskedUint16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x32.PopCountMasked", opLen2(ssa.OpPopCountMaskedUint16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint32x4.PopCountMasked", opLen2(ssa.OpPopCountMaskedUint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x8.PopCountMasked", opLen2(ssa.OpPopCountMaskedUint32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint32x16.PopCountMasked", opLen2(ssa.OpPopCountMaskedUint32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint64x2.PopCountMasked", opLen2(ssa.OpPopCountMaskedUint64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint64x4.PopCountMasked", opLen2(ssa.OpPopCountMaskedUint64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint64x8.PopCountMasked", opLen2(ssa.OpPopCountMaskedUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.Reciprocal", opLen1(ssa.OpReciprocalFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.Reciprocal", opLen1(ssa.OpReciprocalFloat32x8, types.TypeVec256), sys.AMD64) + 
addF(simdPackage, "Float32x16.Reciprocal", opLen1(ssa.OpReciprocalFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.Reciprocal", opLen1(ssa.OpReciprocalFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.Reciprocal", opLen1(ssa.OpReciprocalFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.Reciprocal", opLen1(ssa.OpReciprocalFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.ReciprocalMasked", opLen2(ssa.OpReciprocalMaskedFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.ReciprocalMasked", opLen2(ssa.OpReciprocalMaskedFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.ReciprocalMasked", opLen2(ssa.OpReciprocalMaskedFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.ReciprocalMasked", opLen2(ssa.OpReciprocalMaskedFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.ReciprocalMasked", opLen2(ssa.OpReciprocalMaskedFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.ReciprocalMasked", opLen2(ssa.OpReciprocalMaskedFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.ReciprocalSqrt", opLen1(ssa.OpReciprocalSqrtFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.ReciprocalSqrt", opLen1(ssa.OpReciprocalSqrtFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.ReciprocalSqrt", opLen1(ssa.OpReciprocalSqrtFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.ReciprocalSqrt", opLen1(ssa.OpReciprocalSqrtFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.ReciprocalSqrt", opLen1(ssa.OpReciprocalSqrtFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.ReciprocalSqrt", opLen1(ssa.OpReciprocalSqrtFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.ReciprocalSqrtMasked", opLen2(ssa.OpReciprocalSqrtMaskedFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.ReciprocalSqrtMasked", opLen2(ssa.OpReciprocalSqrtMaskedFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.ReciprocalSqrtMasked", opLen2(ssa.OpReciprocalSqrtMaskedFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.ReciprocalSqrtMasked", opLen2(ssa.OpReciprocalSqrtMaskedFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.ReciprocalSqrtMasked", opLen2(ssa.OpReciprocalSqrtMaskedFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.ReciprocalSqrtMasked", opLen2(ssa.OpReciprocalSqrtMaskedFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x4.RotateAllLeft", opLen1Imm8(ssa.OpRotateAllLeftInt32x4, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Int32x8.RotateAllLeft", opLen1Imm8(ssa.OpRotateAllLeftInt32x8, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Int32x16.RotateAllLeft", opLen1Imm8(ssa.OpRotateAllLeftInt32x16, types.TypeVec512, 0), sys.AMD64) @@ -1340,52 +1361,34 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint64x2.RotateRightMasked", opLen3(ssa.OpRotateRightMaskedUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.RotateRightMasked", opLen3(ssa.OpRotateRightMaskedUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.RotateRightMasked", opLen3(ssa.OpRotateRightMaskedUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.Round", opLen1(ssa.OpRoundFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.Round", opLen1(ssa.OpRoundFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x2.Round", opLen1(ssa.OpRoundFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.Round", opLen1(ssa.OpRoundFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x4.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat64x2, 
types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Int32x4.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x16.SaturatedUnsignedSignedPairDotProd", opLen2(ssa.OpSaturatedUnsignedSignedPairDotProdUint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x32.SaturatedUnsignedSignedPairDotProd", opLen2(ssa.OpSaturatedUnsignedSignedPairDotProdUint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x64.SaturatedUnsignedSignedPairDotProd", opLen2(ssa.OpSaturatedUnsignedSignedPairDotProdUint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x16.SaturatedUnsignedSignedPairDotProdMasked", opLen3(ssa.OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x32.SaturatedUnsignedSignedPairDotProdMasked", opLen3(ssa.OpSaturatedUnsignedSignedPairDotProdMaskedUint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x64.SaturatedUnsignedSignedPairDotProdMasked", opLen3(ssa.OpSaturatedUnsignedSignedPairDotProdMaskedUint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.RoundToEven", opLen1(ssa.OpRoundToEvenFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.RoundToEven", opLen1(ssa.OpRoundToEvenFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x2.RoundToEven", opLen1(ssa.OpRoundToEvenFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.RoundToEven", opLen1(ssa.OpRoundToEvenFloat64x4, 
types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x4.RoundToEvenScaled", opLen1Imm8(ssa.OpRoundToEvenScaledFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.RoundToEvenScaled", opLen1Imm8(ssa.OpRoundToEvenScaledFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.RoundToEvenScaled", opLen1Imm8(ssa.OpRoundToEvenScaledFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.RoundToEvenScaled", opLen1Imm8(ssa.OpRoundToEvenScaledFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.RoundToEvenScaled", opLen1Imm8(ssa.OpRoundToEvenScaledFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.RoundToEvenScaled", opLen1Imm8(ssa.OpRoundToEvenScaledFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.RoundToEvenScaledMasked", opLen2Imm8(ssa.OpRoundToEvenScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.RoundToEvenScaledMasked", opLen2Imm8(ssa.OpRoundToEvenScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.RoundToEvenScaledMasked", opLen2Imm8(ssa.OpRoundToEvenScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.RoundToEvenScaledMasked", opLen2Imm8(ssa.OpRoundToEvenScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.RoundToEvenScaledMasked", opLen2Imm8(ssa.OpRoundToEvenScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.RoundToEvenScaledMasked", opLen2Imm8(ssa.OpRoundToEvenScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.RoundToEvenScaledResidue", opLen1Imm8(ssa.OpRoundToEvenScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.RoundToEvenScaledResidue", opLen1Imm8(ssa.OpRoundToEvenScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.RoundToEvenScaledResidue", opLen1Imm8(ssa.OpRoundToEvenScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.RoundToEvenScaledResidue", opLen1Imm8(ssa.OpRoundToEvenScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.RoundToEvenScaledResidue", opLen1Imm8(ssa.OpRoundToEvenScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.RoundToEvenScaledResidue", opLen1Imm8(ssa.OpRoundToEvenScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.RoundToEvenScaledResidueMasked", opLen2Imm8(ssa.OpRoundToEvenScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.RoundToEvenScaledResidueMasked", opLen2Imm8(ssa.OpRoundToEvenScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.RoundToEvenScaledResidueMasked", opLen2Imm8(ssa.OpRoundToEvenScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.RoundToEvenScaledResidueMasked", opLen2Imm8(ssa.OpRoundToEvenScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.RoundToEvenScaledResidueMasked", opLen2Imm8(ssa.OpRoundToEvenScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.RoundToEvenScaledResidueMasked", opLen2Imm8(ssa.OpRoundToEvenScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) addF(simdPackage, "Float32x4.Scale", opLen2(ssa.OpScaleFloat32x4, types.TypeVec128), sys.AMD64) 
addF(simdPackage, "Float32x8.Scale", opLen2(ssa.OpScaleFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.Scale", opLen2(ssa.OpScaleFloat32x16, types.TypeVec512), sys.AMD64) @@ -1734,12 +1737,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint64x2.ShiftRightMasked", opLen3(ssa.OpShiftRightMaskedUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.ShiftRightMasked", opLen3(ssa.OpShiftRightMaskedUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.ShiftRightMasked", opLen3(ssa.OpShiftRightMaskedUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.Sign", opLen2(ssa.OpSignInt8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.Sign", opLen2(ssa.OpSignInt8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x8.Sign", opLen2(ssa.OpSignInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.Sign", opLen2(ssa.OpSignInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x4.Sign", opLen2(ssa.OpSignInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.Sign", opLen2(ssa.OpSignInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x4.Sqrt", opLen1(ssa.OpSqrtFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Sqrt", opLen1(ssa.OpSqrtFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.Sqrt", opLen1(ssa.OpSqrtFloat32x16, types.TypeVec512), sys.AMD64) @@ -1878,12 +1875,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float64x2.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) addF(simdPackage, "Float64x4.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) addF(simdPackage, "Float64x8.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Int8x16.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.UnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.UnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.UnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x16.Xor", opLen2(ssa.OpXorInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Xor", opLen2(ssa.OpXorInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.Xor", opLen2(ssa.OpXorInt8x64, types.TypeVec512), sys.AMD64) diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index 2138271769..712ee70d51 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -4,153 +4,153 @@ package simd -/* Absolute */ +/* Abs */ -// Absolute computes the absolute value of each element. 
+// Abs computes the absolute value of each element. // // Asm: VPABSB, CPU Feature: AVX -func (x Int8x16) Absolute() Int8x16 +func (x Int8x16) Abs() Int8x16 -// Absolute computes the absolute value of each element. +// Abs computes the absolute value of each element. // // Asm: VPABSB, CPU Feature: AVX2 -func (x Int8x32) Absolute() Int8x32 +func (x Int8x32) Abs() Int8x32 -// Absolute computes the absolute value of each element. +// Abs computes the absolute value of each element. // // Asm: VPABSB, CPU Feature: AVX512BW -func (x Int8x64) Absolute() Int8x64 +func (x Int8x64) Abs() Int8x64 -// Absolute computes the absolute value of each element. +// Abs computes the absolute value of each element. // // Asm: VPABSW, CPU Feature: AVX -func (x Int16x8) Absolute() Int16x8 +func (x Int16x8) Abs() Int16x8 -// Absolute computes the absolute value of each element. +// Abs computes the absolute value of each element. // // Asm: VPABSW, CPU Feature: AVX2 -func (x Int16x16) Absolute() Int16x16 +func (x Int16x16) Abs() Int16x16 -// Absolute computes the absolute value of each element. +// Abs computes the absolute value of each element. // // Asm: VPABSW, CPU Feature: AVX512BW -func (x Int16x32) Absolute() Int16x32 +func (x Int16x32) Abs() Int16x32 -// Absolute computes the absolute value of each element. +// Abs computes the absolute value of each element. // // Asm: VPABSD, CPU Feature: AVX -func (x Int32x4) Absolute() Int32x4 +func (x Int32x4) Abs() Int32x4 -// Absolute computes the absolute value of each element. +// Abs computes the absolute value of each element. // // Asm: VPABSD, CPU Feature: AVX2 -func (x Int32x8) Absolute() Int32x8 +func (x Int32x8) Abs() Int32x8 -// Absolute computes the absolute value of each element. +// Abs computes the absolute value of each element. // // Asm: VPABSD, CPU Feature: AVX512F -func (x Int32x16) Absolute() Int32x16 +func (x Int32x16) Abs() Int32x16 -// Absolute computes the absolute value of each element. +// Abs computes the absolute value of each element. // // Asm: VPABSQ, CPU Feature: AVX512F -func (x Int64x2) Absolute() Int64x2 +func (x Int64x2) Abs() Int64x2 -// Absolute computes the absolute value of each element. +// Abs computes the absolute value of each element. // // Asm: VPABSQ, CPU Feature: AVX512F -func (x Int64x4) Absolute() Int64x4 +func (x Int64x4) Abs() Int64x4 -// Absolute computes the absolute value of each element. +// Abs computes the absolute value of each element. // // Asm: VPABSQ, CPU Feature: AVX512F -func (x Int64x8) Absolute() Int64x8 +func (x Int64x8) Abs() Int64x8 -/* AbsoluteMasked */ +/* AbsMasked */ -// AbsoluteMasked computes the absolute value of each element. +// AbsMasked computes the absolute value of each element. // // This operation is applied selectively under a write mask. // // Asm: VPABSB, CPU Feature: AVX512BW -func (x Int8x16) AbsoluteMasked(mask Mask8x16) Int8x16 +func (x Int8x16) AbsMasked(mask Mask8x16) Int8x16 -// AbsoluteMasked computes the absolute value of each element. +// AbsMasked computes the absolute value of each element. // // This operation is applied selectively under a write mask. // // Asm: VPABSB, CPU Feature: AVX512BW -func (x Int8x32) AbsoluteMasked(mask Mask8x32) Int8x32 +func (x Int8x32) AbsMasked(mask Mask8x32) Int8x32 -// AbsoluteMasked computes the absolute value of each element. +// AbsMasked computes the absolute value of each element. // // This operation is applied selectively under a write mask. 
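Absolute and AbsoluteMasked shrink to Abs and AbsMasked; the signatures are unchanged, so a call site changes only in name. For instance, under the same import assumptions as the earlier sketches:

// magnitudes replaces each signed 16-bit lane with its absolute value.
func magnitudes(x simd.Int16x8) simd.Int16x8 {
	return x.Abs() // formerly x.Absolute()
}

// maskedMagnitudes does the same only for lanes selected by m.
func maskedMagnitudes(x simd.Int16x8, m simd.Mask16x8) simd.Int16x8 {
	return x.AbsMasked(m) // formerly x.AbsoluteMasked(m)
}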
// // Asm: VPABSB, CPU Feature: AVX512BW -func (x Int8x64) AbsoluteMasked(mask Mask8x64) Int8x64 +func (x Int8x64) AbsMasked(mask Mask8x64) Int8x64 -// AbsoluteMasked computes the absolute value of each element. +// AbsMasked computes the absolute value of each element. // // This operation is applied selectively under a write mask. // // Asm: VPABSW, CPU Feature: AVX512BW -func (x Int16x8) AbsoluteMasked(mask Mask16x8) Int16x8 +func (x Int16x8) AbsMasked(mask Mask16x8) Int16x8 -// AbsoluteMasked computes the absolute value of each element. +// AbsMasked computes the absolute value of each element. // // This operation is applied selectively under a write mask. // // Asm: VPABSW, CPU Feature: AVX512BW -func (x Int16x16) AbsoluteMasked(mask Mask16x16) Int16x16 +func (x Int16x16) AbsMasked(mask Mask16x16) Int16x16 -// AbsoluteMasked computes the absolute value of each element. +// AbsMasked computes the absolute value of each element. // // This operation is applied selectively under a write mask. // // Asm: VPABSW, CPU Feature: AVX512BW -func (x Int16x32) AbsoluteMasked(mask Mask16x32) Int16x32 +func (x Int16x32) AbsMasked(mask Mask16x32) Int16x32 -// AbsoluteMasked computes the absolute value of each element. +// AbsMasked computes the absolute value of each element. // // This operation is applied selectively under a write mask. // // Asm: VPABSD, CPU Feature: AVX512F -func (x Int32x4) AbsoluteMasked(mask Mask32x4) Int32x4 +func (x Int32x4) AbsMasked(mask Mask32x4) Int32x4 -// AbsoluteMasked computes the absolute value of each element. +// AbsMasked computes the absolute value of each element. // // This operation is applied selectively under a write mask. // // Asm: VPABSD, CPU Feature: AVX512F -func (x Int32x8) AbsoluteMasked(mask Mask32x8) Int32x8 +func (x Int32x8) AbsMasked(mask Mask32x8) Int32x8 -// AbsoluteMasked computes the absolute value of each element. +// AbsMasked computes the absolute value of each element. // // This operation is applied selectively under a write mask. // // Asm: VPABSD, CPU Feature: AVX512F -func (x Int32x16) AbsoluteMasked(mask Mask32x16) Int32x16 +func (x Int32x16) AbsMasked(mask Mask32x16) Int32x16 -// AbsoluteMasked computes the absolute value of each element. +// AbsMasked computes the absolute value of each element. // // This operation is applied selectively under a write mask. // // Asm: VPABSQ, CPU Feature: AVX512F -func (x Int64x2) AbsoluteMasked(mask Mask64x2) Int64x2 +func (x Int64x2) AbsMasked(mask Mask64x2) Int64x2 -// AbsoluteMasked computes the absolute value of each element. +// AbsMasked computes the absolute value of each element. // // This operation is applied selectively under a write mask. // // Asm: VPABSQ, CPU Feature: AVX512F -func (x Int64x4) AbsoluteMasked(mask Mask64x4) Int64x4 +func (x Int64x4) AbsMasked(mask Mask64x4) Int64x4 -// AbsoluteMasked computes the absolute value of each element. +// AbsMasked computes the absolute value of each element. // // This operation is applied selectively under a write mask. // // Asm: VPABSQ, CPU Feature: AVX512F -func (x Int64x8) AbsoluteMasked(mask Mask64x8) Int64x8 +func (x Int64x8) AbsMasked(mask Mask64x8) Int64x8 /* Add */ @@ -304,45 +304,125 @@ func (x Uint64x4) Add(y Uint64x4) Uint64x4 // Asm: VPADDQ, CPU Feature: AVX512F func (x Uint64x8) Add(y Uint64x8) Uint64x8 -/* AddDotProd */ +/* AddDotProdPairsSaturated */ -// AddDotProd performs dot products on pairs of elements of y and z and then adds x. 
+// AddDotProdPairsSaturated performs dot products on pairs of elements of y and z and then adds x. // -// Asm: VPDPWSSD, CPU Feature: AVXVNNI -func (x Int32x4) AddDotProd(y Int16x8, z Int16x8) Int32x4 +// Asm: VPDPWSSDS, CPU Feature: AVXVNNI +func (x Int32x4) AddDotProdPairsSaturated(y Int16x8, z Int16x8) Int32x4 + +// AddDotProdPairsSaturated performs dot products on pairs of elements of y and z and then adds x. +// +// Asm: VPDPWSSDS, CPU Feature: AVXVNNI +func (x Int32x8) AddDotProdPairsSaturated(y Int16x16, z Int16x16) Int32x8 + +// AddDotProdPairsSaturated performs dot products on pairs of elements of y and z and then adds x. +// +// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI +func (x Int32x16) AddDotProdPairsSaturated(y Int16x32, z Int16x32) Int32x16 + +/* AddDotProdPairsSaturatedMasked */ + +// AddDotProdPairsSaturatedMasked performs dot products on pairs of elements of y and z and then adds x. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI +func (x Int32x4) AddDotProdPairsSaturatedMasked(y Int16x8, z Int16x8, mask Mask32x4) Int32x4 + +// AddDotProdPairsSaturatedMasked performs dot products on pairs of elements of y and z and then adds x. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI +func (x Int32x8) AddDotProdPairsSaturatedMasked(y Int16x16, z Int16x16, mask Mask32x8) Int32x8 + +// AddDotProdPairsSaturatedMasked performs dot products on pairs of elements of y and z and then adds x. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI +func (x Int32x16) AddDotProdPairsSaturatedMasked(y Int16x32, z Int16x32, mask Mask32x16) Int32x16 + +/* AddDotProdQuadruple */ + +// AddDotProdQuadruple performs dot products on groups of 4 elements of x and y and then adds z. +// +// Asm: VPDPBUSD, CPU Feature: AVXVNNI +func (x Int8x16) AddDotProdQuadruple(y Uint8x16, z Int32x4) Int32x4 + +// AddDotProdQuadruple performs dot products on groups of 4 elements of x and y and then adds z. +// +// Asm: VPDPBUSD, CPU Feature: AVXVNNI +func (x Int8x32) AddDotProdQuadruple(y Uint8x32, z Int32x8) Int32x8 + +// AddDotProdQuadruple performs dot products on groups of 4 elements of x and y and then adds z. +// +// Asm: VPDPBUSD, CPU Feature: AVX512VNNI +func (x Int8x64) AddDotProdQuadruple(y Uint8x64, z Int32x16) Int32x16 + +/* AddDotProdQuadrupleMasked */ + +// AddDotProdQuadrupleMasked performs dot products on groups of 4 elements of x and y and then adds z. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPDPBUSD, CPU Feature: AVX512VNNI +func (x Int8x16) AddDotProdQuadrupleMasked(y Uint8x16, z Int32x4, mask Mask32x4) Int32x4 + +// AddDotProdQuadrupleMasked performs dot products on groups of 4 elements of x and y and then adds z. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPDPBUSD, CPU Feature: AVX512VNNI +func (x Int8x32) AddDotProdQuadrupleMasked(y Uint8x32, z Int32x8, mask Mask32x8) Int32x8 + +// AddDotProdQuadrupleMasked performs dot products on groups of 4 elements of x and y and then adds z. +// +// This operation is applied selectively under a write mask. 
+// +// Asm: VPDPBUSD, CPU Feature: AVX512VNNI +func (x Int8x64) AddDotProdQuadrupleMasked(y Uint8x64, z Int32x16, mask Mask32x16) Int32x16 + +/* AddDotProdQuadrupleSaturated */ + +// AddDotProdQuadrupleSaturated multiplies performs dot products on groups of 4 elements of x and y and then adds z. +// +// Asm: VPDPBUSDS, CPU Feature: AVXVNNI +func (x Int8x16) AddDotProdQuadrupleSaturated(y Uint8x16, z Int32x4) Int32x4 -// AddDotProd performs dot products on pairs of elements of y and z and then adds x. +// AddDotProdQuadrupleSaturated multiplies performs dot products on groups of 4 elements of x and y and then adds z. // -// Asm: VPDPWSSD, CPU Feature: AVXVNNI -func (x Int32x8) AddDotProd(y Int16x16, z Int16x16) Int32x8 +// Asm: VPDPBUSDS, CPU Feature: AVXVNNI +func (x Int8x32) AddDotProdQuadrupleSaturated(y Uint8x32, z Int32x8) Int32x8 -// AddDotProd performs dot products on pairs of elements of y and z and then adds x. +// AddDotProdQuadrupleSaturated multiplies performs dot products on groups of 4 elements of x and y and then adds z. // -// Asm: VPDPWSSD, CPU Feature: AVX512VNNI -func (x Int32x16) AddDotProd(y Int16x32, z Int16x32) Int32x16 +// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI +func (x Int8x64) AddDotProdQuadrupleSaturated(y Uint8x64, z Int32x16) Int32x16 -/* AddDotProdMasked */ +/* AddDotProdQuadrupleSaturatedMasked */ -// AddDotProdMasked performs dot products on pairs of elements of y and z and then adds x. +// AddDotProdQuadrupleSaturatedMasked multiplies performs dot products on groups of 4 elements of x and y and then adds z. // // This operation is applied selectively under a write mask. // -// Asm: VPDPWSSD, CPU Feature: AVX512VNNI -func (x Int32x4) AddDotProdMasked(y Int16x8, z Int16x8, mask Mask32x4) Int32x4 +// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI +func (x Int8x16) AddDotProdQuadrupleSaturatedMasked(y Uint8x16, z Int32x4, mask Mask32x4) Int32x4 -// AddDotProdMasked performs dot products on pairs of elements of y and z and then adds x. +// AddDotProdQuadrupleSaturatedMasked multiplies performs dot products on groups of 4 elements of x and y and then adds z. // // This operation is applied selectively under a write mask. // -// Asm: VPDPWSSD, CPU Feature: AVX512VNNI -func (x Int32x8) AddDotProdMasked(y Int16x16, z Int16x16, mask Mask32x8) Int32x8 +// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI +func (x Int8x32) AddDotProdQuadrupleSaturatedMasked(y Uint8x32, z Int32x8, mask Mask32x8) Int32x8 -// AddDotProdMasked performs dot products on pairs of elements of y and z and then adds x. +// AddDotProdQuadrupleSaturatedMasked multiplies performs dot products on groups of 4 elements of x and y and then adds z. // // This operation is applied selectively under a write mask. // -// Asm: VPDPWSSD, CPU Feature: AVX512VNNI -func (x Int32x16) AddDotProdMasked(y Int16x32, z Int16x32, mask Mask32x16) Int32x16 +// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI +func (x Int8x64) AddDotProdQuadrupleSaturatedMasked(y Uint8x64, z Int32x16, mask Mask32x16) Int32x16 /* AddMasked */ @@ -678,32 +758,32 @@ func (x Int16x32) AddSaturated(y Int16x32) Int16x32 // AddSaturated adds corresponding elements of two vectors with saturation. // -// Asm: VPADDSB, CPU Feature: AVX +// Asm: VPADDUSB, CPU Feature: AVX func (x Uint8x16) AddSaturated(y Uint8x16) Uint8x16 // AddSaturated adds corresponding elements of two vectors with saturation. 
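UnsignedSignedQuadDotProdAccumulate becomes AddDotProdQuadruple, with the saturating form spelled AddDotProdQuadrupleSaturated. Using the signatures added above, a sketch of a four-wide byte dot product accumulated into 32-bit lanes (VPDPBUSD), under the same import assumptions:

// accumulate adds, into each 32-bit lane of acc, the dot product of the
// corresponding group of four bytes from x (signed) and w (unsigned).
func accumulate(x simd.Int8x16, w simd.Uint8x16, acc simd.Int32x4) simd.Int32x4 {
	return x.AddDotProdQuadruple(w, acc)
}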
// -// Asm: VPADDSB, CPU Feature: AVX2 +// Asm: VPADDUSB, CPU Feature: AVX2 func (x Uint8x32) AddSaturated(y Uint8x32) Uint8x32 // AddSaturated adds corresponding elements of two vectors with saturation. // -// Asm: VPADDSB, CPU Feature: AVX512BW +// Asm: VPADDUSB, CPU Feature: AVX512BW func (x Uint8x64) AddSaturated(y Uint8x64) Uint8x64 // AddSaturated adds corresponding elements of two vectors with saturation. // -// Asm: VPADDSW, CPU Feature: AVX +// Asm: VPADDUSW, CPU Feature: AVX func (x Uint16x8) AddSaturated(y Uint16x8) Uint16x8 // AddSaturated adds corresponding elements of two vectors with saturation. // -// Asm: VPADDSW, CPU Feature: AVX2 +// Asm: VPADDUSW, CPU Feature: AVX2 func (x Uint16x16) AddSaturated(y Uint16x16) Uint16x16 // AddSaturated adds corresponding elements of two vectors with saturation. // -// Asm: VPADDSW, CPU Feature: AVX512BW +// Asm: VPADDUSW, CPU Feature: AVX512BW func (x Uint16x32) AddSaturated(y Uint16x32) Uint16x32 /* AddSaturatedMasked */ @@ -754,42 +834,42 @@ func (x Int16x32) AddSaturatedMasked(y Int16x32, mask Mask16x32) Int16x32 // // This operation is applied selectively under a write mask. // -// Asm: VPADDSB, CPU Feature: AVX512BW +// Asm: VPADDUSB, CPU Feature: AVX512BW func (x Uint8x16) AddSaturatedMasked(y Uint8x16, mask Mask8x16) Uint8x16 // AddSaturatedMasked adds corresponding elements of two vectors with saturation. // // This operation is applied selectively under a write mask. // -// Asm: VPADDSB, CPU Feature: AVX512BW +// Asm: VPADDUSB, CPU Feature: AVX512BW func (x Uint8x32) AddSaturatedMasked(y Uint8x32, mask Mask8x32) Uint8x32 // AddSaturatedMasked adds corresponding elements of two vectors with saturation. // // This operation is applied selectively under a write mask. // -// Asm: VPADDSB, CPU Feature: AVX512BW +// Asm: VPADDUSB, CPU Feature: AVX512BW func (x Uint8x64) AddSaturatedMasked(y Uint8x64, mask Mask8x64) Uint8x64 // AddSaturatedMasked adds corresponding elements of two vectors with saturation. // // This operation is applied selectively under a write mask. // -// Asm: VPADDSW, CPU Feature: AVX512BW +// Asm: VPADDUSW, CPU Feature: AVX512BW func (x Uint16x8) AddSaturatedMasked(y Uint16x8, mask Mask16x8) Uint16x8 // AddSaturatedMasked adds corresponding elements of two vectors with saturation. // // This operation is applied selectively under a write mask. // -// Asm: VPADDSW, CPU Feature: AVX512BW +// Asm: VPADDUSW, CPU Feature: AVX512BW func (x Uint16x16) AddSaturatedMasked(y Uint16x16, mask Mask16x16) Uint16x16 // AddSaturatedMasked adds corresponding elements of two vectors with saturation. // // This operation is applied selectively under a write mask. // -// Asm: VPADDSW, CPU Feature: AVX512BW +// Asm: VPADDUSW, CPU Feature: AVX512BW func (x Uint16x32) AddSaturatedMasked(y Uint16x32, mask Mask16x32) Uint16x32 /* AddSub */ @@ -1230,158 +1310,6 @@ func (x Uint64x4) AndNotMasked(y Uint64x4, mask Mask64x4) Uint64x4 // Asm: VPANDNQ, CPU Feature: AVX512F func (x Uint64x8) AndNotMasked(y Uint64x8, mask Mask64x8) Uint64x8 -/* ApproximateReciprocal */ - -// ApproximateReciprocal computes an approximate reciprocal of each element. -// -// Asm: VRCPPS, CPU Feature: AVX -func (x Float32x4) ApproximateReciprocal() Float32x4 - -// ApproximateReciprocal computes an approximate reciprocal of each element. -// -// Asm: VRCPPS, CPU Feature: AVX -func (x Float32x8) ApproximateReciprocal() Float32x8 - -// ApproximateReciprocal computes an approximate reciprocal of each element. 
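The hunk above is one of the genuine bug fixes in this CL rather than a rename: the unsigned AddSaturated and AddSaturatedMasked mappings previously pointed at the signed-saturation instructions (VPADDSB/VPADDSW) and now use the unsigned ones (VPADDUSB/VPADDUSW). The call itself is unchanged; only the generated instruction, and therefore the clamping behavior on unsigned vectors, is corrected. A sketch under the usual assumptions:

// clampAdd adds byte lanes with unsigned saturation, so 250+10 stays at
// 255 instead of wrapping; after this CL it assembles to VPADDUSB.
func clampAdd(x, y simd.Uint8x16) simd.Uint8x16 {
	return x.AddSaturated(y)
}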
-// -// Asm: VRCP14PS, CPU Feature: AVX512F -func (x Float32x16) ApproximateReciprocal() Float32x16 - -// ApproximateReciprocal computes an approximate reciprocal of each element. -// -// Asm: VRCP14PD, CPU Feature: AVX512F -func (x Float64x2) ApproximateReciprocal() Float64x2 - -// ApproximateReciprocal computes an approximate reciprocal of each element. -// -// Asm: VRCP14PD, CPU Feature: AVX512F -func (x Float64x4) ApproximateReciprocal() Float64x4 - -// ApproximateReciprocal computes an approximate reciprocal of each element. -// -// Asm: VRCP14PD, CPU Feature: AVX512F -func (x Float64x8) ApproximateReciprocal() Float64x8 - -/* ApproximateReciprocalMasked */ - -// ApproximateReciprocalMasked computes an approximate reciprocal of each element. -// -// This operation is applied selectively under a write mask. -// -// Asm: VRCP14PS, CPU Feature: AVX512F -func (x Float32x4) ApproximateReciprocalMasked(mask Mask32x4) Float32x4 - -// ApproximateReciprocalMasked computes an approximate reciprocal of each element. -// -// This operation is applied selectively under a write mask. -// -// Asm: VRCP14PS, CPU Feature: AVX512F -func (x Float32x8) ApproximateReciprocalMasked(mask Mask32x8) Float32x8 - -// ApproximateReciprocalMasked computes an approximate reciprocal of each element. -// -// This operation is applied selectively under a write mask. -// -// Asm: VRCP14PS, CPU Feature: AVX512F -func (x Float32x16) ApproximateReciprocalMasked(mask Mask32x16) Float32x16 - -// ApproximateReciprocalMasked computes an approximate reciprocal of each element. -// -// This operation is applied selectively under a write mask. -// -// Asm: VRCP14PD, CPU Feature: AVX512F -func (x Float64x2) ApproximateReciprocalMasked(mask Mask64x2) Float64x2 - -// ApproximateReciprocalMasked computes an approximate reciprocal of each element. -// -// This operation is applied selectively under a write mask. -// -// Asm: VRCP14PD, CPU Feature: AVX512F -func (x Float64x4) ApproximateReciprocalMasked(mask Mask64x4) Float64x4 - -// ApproximateReciprocalMasked computes an approximate reciprocal of each element. -// -// This operation is applied selectively under a write mask. -// -// Asm: VRCP14PD, CPU Feature: AVX512F -func (x Float64x8) ApproximateReciprocalMasked(mask Mask64x8) Float64x8 - -/* ApproximateReciprocalOfSqrt */ - -// ApproximateReciprocalOfSqrt computes an approximate reciprocal of the square root of each element. -// -// Asm: VRSQRTPS, CPU Feature: AVX -func (x Float32x4) ApproximateReciprocalOfSqrt() Float32x4 - -// ApproximateReciprocalOfSqrt computes an approximate reciprocal of the square root of each element. -// -// Asm: VRSQRTPS, CPU Feature: AVX -func (x Float32x8) ApproximateReciprocalOfSqrt() Float32x8 - -// ApproximateReciprocalOfSqrt computes an approximate reciprocal of the square root of each element. -// -// Asm: VRSQRT14PS, CPU Feature: AVX512F -func (x Float32x16) ApproximateReciprocalOfSqrt() Float32x16 - -// ApproximateReciprocalOfSqrt computes an approximate reciprocal of the square root of each element. -// -// Asm: VRSQRT14PD, CPU Feature: AVX512F -func (x Float64x2) ApproximateReciprocalOfSqrt() Float64x2 - -// ApproximateReciprocalOfSqrt computes an approximate reciprocal of the square root of each element. -// -// Asm: VRSQRT14PD, CPU Feature: AVX512F -func (x Float64x4) ApproximateReciprocalOfSqrt() Float64x4 - -// ApproximateReciprocalOfSqrt computes an approximate reciprocal of the square root of each element. 
-// -// Asm: VRSQRT14PD, CPU Feature: AVX512F -func (x Float64x8) ApproximateReciprocalOfSqrt() Float64x8 - -/* ApproximateReciprocalOfSqrtMasked */ - -// ApproximateReciprocalOfSqrtMasked computes an approximate reciprocal of the square root of each element. -// -// This operation is applied selectively under a write mask. -// -// Asm: VRSQRT14PS, CPU Feature: AVX512F -func (x Float32x4) ApproximateReciprocalOfSqrtMasked(mask Mask32x4) Float32x4 - -// ApproximateReciprocalOfSqrtMasked computes an approximate reciprocal of the square root of each element. -// -// This operation is applied selectively under a write mask. -// -// Asm: VRSQRT14PS, CPU Feature: AVX512F -func (x Float32x8) ApproximateReciprocalOfSqrtMasked(mask Mask32x8) Float32x8 - -// ApproximateReciprocalOfSqrtMasked computes an approximate reciprocal of the square root of each element. -// -// This operation is applied selectively under a write mask. -// -// Asm: VRSQRT14PS, CPU Feature: AVX512F -func (x Float32x16) ApproximateReciprocalOfSqrtMasked(mask Mask32x16) Float32x16 - -// ApproximateReciprocalOfSqrtMasked computes an approximate reciprocal of the square root of each element. -// -// This operation is applied selectively under a write mask. -// -// Asm: VRSQRT14PD, CPU Feature: AVX512F -func (x Float64x2) ApproximateReciprocalOfSqrtMasked(mask Mask64x2) Float64x2 - -// ApproximateReciprocalOfSqrtMasked computes an approximate reciprocal of the square root of each element. -// -// This operation is applied selectively under a write mask. -// -// Asm: VRSQRT14PD, CPU Feature: AVX512F -func (x Float64x4) ApproximateReciprocalOfSqrtMasked(mask Mask64x4) Float64x4 - -// ApproximateReciprocalOfSqrtMasked computes an approximate reciprocal of the square root of each element. -// -// This operation is applied selectively under a write mask. -// -// Asm: VRSQRT14PD, CPU Feature: AVX512F -func (x Float64x8) ApproximateReciprocalOfSqrtMasked(mask Mask64x8) Float64x8 - /* Average */ // Average computes the rounded average of corresponding elements. @@ -1942,6 +1870,44 @@ func (x Float32x8) ConvertToUint32Masked(mask Mask32x8) Uint32x8 // Asm: VCVTPS2UDQ, CPU Feature: AVX512F func (x Float32x16) ConvertToUint32Masked(mask Mask32x16) Uint32x16 +/* CopySign */ + +// CopySign returns the product of the first operand with -1, 0, or 1, +// whichever constant is nearest to the value of the second operand. +// +// Asm: VPSIGNB, CPU Feature: AVX +func (x Int8x16) CopySign(y Int8x16) Int8x16 + +// CopySign returns the product of the first operand with -1, 0, or 1, +// whichever constant is nearest to the value of the second operand. +// +// Asm: VPSIGNB, CPU Feature: AVX2 +func (x Int8x32) CopySign(y Int8x32) Int8x32 + +// CopySign returns the product of the first operand with -1, 0, or 1, +// whichever constant is nearest to the value of the second operand. +// +// Asm: VPSIGNW, CPU Feature: AVX +func (x Int16x8) CopySign(y Int16x8) Int16x8 + +// CopySign returns the product of the first operand with -1, 0, or 1, +// whichever constant is nearest to the value of the second operand. +// +// Asm: VPSIGNW, CPU Feature: AVX2 +func (x Int16x16) CopySign(y Int16x16) Int16x16 + +// CopySign returns the product of the first operand with -1, 0, or 1, +// whichever constant is nearest to the value of the second operand. +// +// Asm: VPSIGND, CPU Feature: AVX +func (x Int32x4) CopySign(y Int32x4) Int32x4 + +// CopySign returns the product of the first operand with -1, 0, or 1, +// whichever constant is nearest to the value of the second operand. 
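The intrinsics list earlier in this patch drops Int*.Sign, and ops_amd64.go gains CopySign on the same element types with the same VPSIGN* instructions. Note from the documentation below that this is PSIGN semantics rather than math.Copysign: a zero in the second operand zeroes the lane instead of passing the value through. A short sketch under the same assumptions:

// applySign multiplies each lane of x by -1, 0, or +1 according to the
// sign of the corresponding lane of signs.
func applySign(x, signs simd.Int16x8) simd.Int16x8 {
	return x.CopySign(signs) // formerly x.Sign(signs)
}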
+// +// Asm: VPSIGND, CPU Feature: AVX2 +func (x Int32x8) CopySign(y Int32x8) Int32x8 + /* Div */ // Div divides elements of two vectors. @@ -2018,22 +1984,97 @@ func (x Float64x4) DivMasked(y Float64x4, mask Mask64x4) Float64x4 // Asm: VDIVPD, CPU Feature: AVX512F func (x Float64x8) DivMasked(y Float64x8, mask Mask64x8) Float64x8 -/* DotProdBroadcast */ +/* DotProdPairs */ + +// DotProdPairs multiplies the elements and add the pairs together, +// yielding a vector of half as many elements with twice the input element size. +// +// Asm: VPMADDWD, CPU Feature: AVX +func (x Int16x8) DotProdPairs(y Int16x8) Int32x4 + +// DotProdPairs multiplies the elements and add the pairs together, +// yielding a vector of half as many elements with twice the input element size. +// +// Asm: VPMADDWD, CPU Feature: AVX2 +func (x Int16x16) DotProdPairs(y Int16x16) Int32x8 + +// DotProdPairs multiplies the elements and add the pairs together, +// yielding a vector of half as many elements with twice the input element size. +// +// Asm: VPMADDWD, CPU Feature: AVX512BW +func (x Int16x32) DotProdPairs(y Int16x32) Int32x16 + +/* DotProdPairsMasked */ + +// DotProdPairsMasked multiplies the elements and add the pairs together, +// yielding a vector of half as many elements with twice the input element size. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMADDWD, CPU Feature: AVX512BW +func (x Int16x8) DotProdPairsMasked(y Int16x8, mask Mask16x8) Int32x4 + +// DotProdPairsMasked multiplies the elements and add the pairs together, +// yielding a vector of half as many elements with twice the input element size. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMADDWD, CPU Feature: AVX512BW +func (x Int16x16) DotProdPairsMasked(y Int16x16, mask Mask16x16) Int32x8 + +// DotProdPairsMasked multiplies the elements and add the pairs together, +// yielding a vector of half as many elements with twice the input element size. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMADDWD, CPU Feature: AVX512BW +func (x Int16x32) DotProdPairsMasked(y Int16x32, mask Mask16x32) Int32x16 + +/* DotProdPairsSaturated */ + +// DotProdPairsSaturated multiplies the elements and add the pairs together with saturation, +// yielding a vector of half as many elements with twice the input element size. +// +// Asm: VPMADDUBSW, CPU Feature: AVX +func (x Uint8x16) DotProdPairsSaturated(y Int8x16) Int16x8 + +// DotProdPairsSaturated multiplies the elements and add the pairs together with saturation, +// yielding a vector of half as many elements with twice the input element size. +// +// Asm: VPMADDUBSW, CPU Feature: AVX2 +func (x Uint8x32) DotProdPairsSaturated(y Int8x32) Int16x16 + +// DotProdPairsSaturated multiplies the elements and add the pairs together with saturation, +// yielding a vector of half as many elements with twice the input element size. +// +// Asm: VPMADDUBSW, CPU Feature: AVX512BW +func (x Uint8x64) DotProdPairsSaturated(y Int8x64) Int16x32 + +/* DotProdPairsSaturatedMasked */ -// DotProdBroadcast multiplies all elements and broadcasts the sum. +// DotProdPairsSaturatedMasked multiplies the elements and add the pairs together with saturation, +// yielding a vector of half as many elements with twice the input element size. +// +// This operation is applied selectively under a write mask. 
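PairDotProd becomes DotProdPairs and SaturatedUnsignedSignedPairDotProd becomes DotProdPairsSaturated; both widen, producing half as many lanes at twice the element width. Using the signatures above, and the same import assumptions as the earlier sketches:

// pairDot multiplies corresponding int16 lanes and adds adjacent products,
// producing four int32 sums from eight int16 inputs (VPMADDWD).
func pairDot(x, y simd.Int16x8) simd.Int32x4 {
	return x.DotProdPairs(y) // formerly x.PairDotProd(y)
}

// pairDotSat is the mixed unsigned/signed byte form with saturation
// (VPMADDUBSW), yielding eight int16 sums from sixteen byte inputs.
func pairDotSat(x simd.Uint8x16, y simd.Int8x16) simd.Int16x8 {
	return x.DotProdPairsSaturated(y)
}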
// -// Asm: VDPPS, CPU Feature: AVX -func (x Float32x4) DotProdBroadcast(y Float32x4) Float32x4 +// Asm: VPMADDUBSW, CPU Feature: AVX512BW +func (x Uint8x16) DotProdPairsSaturatedMasked(y Int8x16, mask Mask16x8) Int16x8 -// DotProdBroadcast multiplies all elements and broadcasts the sum. +// DotProdPairsSaturatedMasked multiplies the elements and add the pairs together with saturation, +// yielding a vector of half as many elements with twice the input element size. +// +// This operation is applied selectively under a write mask. // -// Asm: VDPPS, CPU Feature: AVX -func (x Float32x8) DotProdBroadcast(y Float32x8) Float32x8 +// Asm: VPMADDUBSW, CPU Feature: AVX512BW +func (x Uint8x32) DotProdPairsSaturatedMasked(y Int8x32, mask Mask16x16) Int16x16 -// DotProdBroadcast multiplies all elements and broadcasts the sum. +// DotProdPairsSaturatedMasked multiplies the elements and add the pairs together with saturation, +// yielding a vector of half as many elements with twice the input element size. +// +// This operation is applied selectively under a write mask. // -// Asm: VDPPD, CPU Feature: AVX -func (x Float64x2) DotProdBroadcast(y Float64x2) Float64x2 +// Asm: VPMADDUBSW, CPU Feature: AVX512BW +func (x Uint8x64) DotProdPairsSaturatedMasked(y Int8x64, mask Mask16x32) Int16x32 /* Equal */ @@ -2803,235 +2844,7 @@ func (x Float64x4) FloorScaledResidueMasked(prec uint8, mask Mask64x4) Float64x4 // Asm: VREDUCEPD, CPU Feature: AVX512DQ func (x Float64x8) FloorScaledResidueMasked(prec uint8, mask Mask64x8) Float64x8 -/* FusedMultiplyAdd */ - -// FusedMultiplyAdd performs (x * y) + z. -// -// Asm: VFMADD213PS, CPU Feature: AVX512F -func (x Float32x4) FusedMultiplyAdd(y Float32x4, z Float32x4) Float32x4 - -// FusedMultiplyAdd performs (x * y) + z. -// -// Asm: VFMADD213PS, CPU Feature: AVX512F -func (x Float32x8) FusedMultiplyAdd(y Float32x8, z Float32x8) Float32x8 - -// FusedMultiplyAdd performs (x * y) + z. -// -// Asm: VFMADD213PS, CPU Feature: AVX512F -func (x Float32x16) FusedMultiplyAdd(y Float32x16, z Float32x16) Float32x16 - -// FusedMultiplyAdd performs (x * y) + z. -// -// Asm: VFMADD213PD, CPU Feature: AVX512F -func (x Float64x2) FusedMultiplyAdd(y Float64x2, z Float64x2) Float64x2 - -// FusedMultiplyAdd performs (x * y) + z. -// -// Asm: VFMADD213PD, CPU Feature: AVX512F -func (x Float64x4) FusedMultiplyAdd(y Float64x4, z Float64x4) Float64x4 - -// FusedMultiplyAdd performs (x * y) + z. -// -// Asm: VFMADD213PD, CPU Feature: AVX512F -func (x Float64x8) FusedMultiplyAdd(y Float64x8, z Float64x8) Float64x8 - -/* FusedMultiplyAddMasked */ - -// FusedMultiplyAddMasked performs (x * y) + z. -// -// This operation is applied selectively under a write mask. -// -// Asm: VFMADD213PS, CPU Feature: AVX512F -func (x Float32x4) FusedMultiplyAddMasked(y Float32x4, z Float32x4, mask Mask32x4) Float32x4 - -// FusedMultiplyAddMasked performs (x * y) + z. -// -// This operation is applied selectively under a write mask. -// -// Asm: VFMADD213PS, CPU Feature: AVX512F -func (x Float32x8) FusedMultiplyAddMasked(y Float32x8, z Float32x8, mask Mask32x8) Float32x8 - -// FusedMultiplyAddMasked performs (x * y) + z. -// -// This operation is applied selectively under a write mask. -// -// Asm: VFMADD213PS, CPU Feature: AVX512F -func (x Float32x16) FusedMultiplyAddMasked(y Float32x16, z Float32x16, mask Mask32x16) Float32x16 - -// FusedMultiplyAddMasked performs (x * y) + z. -// -// This operation is applied selectively under a write mask. 
-// -// Asm: VFMADD213PD, CPU Feature: AVX512F -func (x Float64x2) FusedMultiplyAddMasked(y Float64x2, z Float64x2, mask Mask64x2) Float64x2 - -// FusedMultiplyAddMasked performs (x * y) + z. -// -// This operation is applied selectively under a write mask. -// -// Asm: VFMADD213PD, CPU Feature: AVX512F -func (x Float64x4) FusedMultiplyAddMasked(y Float64x4, z Float64x4, mask Mask64x4) Float64x4 - -// FusedMultiplyAddMasked performs (x * y) + z. -// -// This operation is applied selectively under a write mask. -// -// Asm: VFMADD213PD, CPU Feature: AVX512F -func (x Float64x8) FusedMultiplyAddMasked(y Float64x8, z Float64x8, mask Mask64x8) Float64x8 - -/* FusedMultiplyAddSub */ - -// FusedMultiplyAddSub performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. -// -// Asm: VFMADDSUB213PS, CPU Feature: AVX512F -func (x Float32x4) FusedMultiplyAddSub(y Float32x4, z Float32x4) Float32x4 - -// FusedMultiplyAddSub performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. -// -// Asm: VFMADDSUB213PS, CPU Feature: AVX512F -func (x Float32x8) FusedMultiplyAddSub(y Float32x8, z Float32x8) Float32x8 - -// FusedMultiplyAddSub performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. -// -// Asm: VFMADDSUB213PS, CPU Feature: AVX512F -func (x Float32x16) FusedMultiplyAddSub(y Float32x16, z Float32x16) Float32x16 - -// FusedMultiplyAddSub performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. -// -// Asm: VFMADDSUB213PD, CPU Feature: AVX512F -func (x Float64x2) FusedMultiplyAddSub(y Float64x2, z Float64x2) Float64x2 - -// FusedMultiplyAddSub performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. -// -// Asm: VFMADDSUB213PD, CPU Feature: AVX512F -func (x Float64x4) FusedMultiplyAddSub(y Float64x4, z Float64x4) Float64x4 - -// FusedMultiplyAddSub performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. -// -// Asm: VFMADDSUB213PD, CPU Feature: AVX512F -func (x Float64x8) FusedMultiplyAddSub(y Float64x8, z Float64x8) Float64x8 - -/* FusedMultiplyAddSubMasked */ - -// FusedMultiplyAddSubMasked performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VFMADDSUB213PS, CPU Feature: AVX512F -func (x Float32x4) FusedMultiplyAddSubMasked(y Float32x4, z Float32x4, mask Mask32x4) Float32x4 - -// FusedMultiplyAddSubMasked performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VFMADDSUB213PS, CPU Feature: AVX512F -func (x Float32x8) FusedMultiplyAddSubMasked(y Float32x8, z Float32x8, mask Mask32x8) Float32x8 - -// FusedMultiplyAddSubMasked performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VFMADDSUB213PS, CPU Feature: AVX512F -func (x Float32x16) FusedMultiplyAddSubMasked(y Float32x16, z Float32x16, mask Mask32x16) Float32x16 - -// FusedMultiplyAddSubMasked performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. -// -// This operation is applied selectively under a write mask. 
-// -// Asm: VFMADDSUB213PD, CPU Feature: AVX512F -func (x Float64x2) FusedMultiplyAddSubMasked(y Float64x2, z Float64x2, mask Mask64x2) Float64x2 - -// FusedMultiplyAddSubMasked performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VFMADDSUB213PD, CPU Feature: AVX512F -func (x Float64x4) FusedMultiplyAddSubMasked(y Float64x4, z Float64x4, mask Mask64x4) Float64x4 - -// FusedMultiplyAddSubMasked performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VFMADDSUB213PD, CPU Feature: AVX512F -func (x Float64x8) FusedMultiplyAddSubMasked(y Float64x8, z Float64x8, mask Mask64x8) Float64x8 - -/* FusedMultiplySubAdd */ - -// FusedMultiplySubAdd performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. -// -// Asm: VFMSUBADD213PS, CPU Feature: AVX512F -func (x Float32x4) FusedMultiplySubAdd(y Float32x4, z Float32x4) Float32x4 - -// FusedMultiplySubAdd performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. -// -// Asm: VFMSUBADD213PS, CPU Feature: AVX512F -func (x Float32x8) FusedMultiplySubAdd(y Float32x8, z Float32x8) Float32x8 - -// FusedMultiplySubAdd performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. -// -// Asm: VFMSUBADD213PS, CPU Feature: AVX512F -func (x Float32x16) FusedMultiplySubAdd(y Float32x16, z Float32x16) Float32x16 - -// FusedMultiplySubAdd performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. -// -// Asm: VFMSUBADD213PD, CPU Feature: AVX512F -func (x Float64x2) FusedMultiplySubAdd(y Float64x2, z Float64x2) Float64x2 - -// FusedMultiplySubAdd performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. -// -// Asm: VFMSUBADD213PD, CPU Feature: AVX512F -func (x Float64x4) FusedMultiplySubAdd(y Float64x4, z Float64x4) Float64x4 - -// FusedMultiplySubAdd performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. -// -// Asm: VFMSUBADD213PD, CPU Feature: AVX512F -func (x Float64x8) FusedMultiplySubAdd(y Float64x8, z Float64x8) Float64x8 - -/* FusedMultiplySubAddMasked */ - -// FusedMultiplySubAddMasked performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VFMSUBADD213PS, CPU Feature: AVX512F -func (x Float32x4) FusedMultiplySubAddMasked(y Float32x4, z Float32x4, mask Mask32x4) Float32x4 - -// FusedMultiplySubAddMasked performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VFMSUBADD213PS, CPU Feature: AVX512F -func (x Float32x8) FusedMultiplySubAddMasked(y Float32x8, z Float32x8, mask Mask32x8) Float32x8 - -// FusedMultiplySubAddMasked performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VFMSUBADD213PS, CPU Feature: AVX512F -func (x Float32x16) FusedMultiplySubAddMasked(y Float32x16, z Float32x16, mask Mask32x16) Float32x16 - -// FusedMultiplySubAddMasked performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. -// -// This operation is applied selectively under a write mask. 
-// -// Asm: VFMSUBADD213PD, CPU Feature: AVX512F -func (x Float64x2) FusedMultiplySubAddMasked(y Float64x2, z Float64x2, mask Mask64x2) Float64x2 - -// FusedMultiplySubAddMasked performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VFMSUBADD213PD, CPU Feature: AVX512F -func (x Float64x4) FusedMultiplySubAddMasked(y Float64x4, z Float64x4, mask Mask64x4) Float64x4 - -// FusedMultiplySubAddMasked performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VFMSUBADD213PD, CPU Feature: AVX512F -func (x Float64x8) FusedMultiplySubAddMasked(y Float64x8, z Float64x8, mask Mask64x8) Float64x8 - -/* GaloisFieldAffineTransform */ +/* GaloisFieldAffineTransform */ // GaloisFieldAffineTransform computes an affine transformation in GF(2^8): // x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes; @@ -5822,193 +5635,268 @@ func (x Int64x4) Mul(y Int64x4) Int64x4 // Asm: VPMULLQ, CPU Feature: AVX512DQ func (x Int64x8) Mul(y Int64x8) Int64x8 -/* MulEvenWiden */ +// Mul multiplies corresponding elements of two vectors. +// +// Asm: VPMULLW, CPU Feature: AVX +func (x Uint16x8) Mul(y Uint16x8) Uint16x8 -// MulEvenWiden multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VPMULDQ, CPU Feature: AVX -func (x Int32x4) MulEvenWiden(y Int32x4) Int64x2 +// Asm: VPMULLW, CPU Feature: AVX2 +func (x Uint16x16) Mul(y Uint16x16) Uint16x16 -// MulEvenWiden multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VPMULDQ, CPU Feature: AVX2 -func (x Int32x8) MulEvenWiden(y Int32x8) Int64x4 +// Asm: VPMULLW, CPU Feature: AVX512BW +func (x Uint16x32) Mul(y Uint16x32) Uint16x32 -// MulEvenWiden multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VPMULDQ, CPU Feature: AVX512F -func (x Int64x2) MulEvenWiden(y Int64x2) Int64x2 +// Asm: VPMULLD, CPU Feature: AVX +func (x Uint32x4) Mul(y Uint32x4) Uint32x4 -// MulEvenWiden multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VPMULDQ, CPU Feature: AVX512F -func (x Int64x4) MulEvenWiden(y Int64x4) Int64x4 +// Asm: VPMULLD, CPU Feature: AVX2 +func (x Uint32x8) Mul(y Uint32x8) Uint32x8 -// MulEvenWiden multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VPMULDQ, CPU Feature: AVX512F -func (x Int64x8) MulEvenWiden(y Int64x8) Int64x8 +// Asm: VPMULLD, CPU Feature: AVX512F +func (x Uint32x16) Mul(y Uint32x16) Uint32x16 -// MulEvenWiden multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VPMULUDQ, CPU Feature: AVX -func (x Uint32x4) MulEvenWiden(y Uint32x4) Uint64x2 +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Uint64x2) Mul(y Uint64x2) Uint64x2 -// MulEvenWiden multiplies even-indexed elements, widening the result. 
-// Result[i] = v1.Even[i] * v2.Even[i]. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VPMULUDQ, CPU Feature: AVX2 -func (x Uint32x8) MulEvenWiden(y Uint32x8) Uint64x4 +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Uint64x4) Mul(y Uint64x4) Uint64x4 -// MulEvenWiden multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// Mul multiplies corresponding elements of two vectors. +// +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Uint64x8) Mul(y Uint64x8) Uint64x8 + +/* MulAdd */ + +// MulAdd performs a fused (x * y) + z. // -// Asm: VPMULUDQ, CPU Feature: AVX512F -func (x Uint64x2) MulEvenWiden(y Uint64x2) Uint64x2 +// Asm: VFMADD213PS, CPU Feature: AVX512F +func (x Float32x4) MulAdd(y Float32x4, z Float32x4) Float32x4 -// MulEvenWiden multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// MulAdd performs a fused (x * y) + z. // -// Asm: VPMULUDQ, CPU Feature: AVX512F -func (x Uint64x4) MulEvenWiden(y Uint64x4) Uint64x4 +// Asm: VFMADD213PS, CPU Feature: AVX512F +func (x Float32x8) MulAdd(y Float32x8, z Float32x8) Float32x8 -// MulEvenWiden multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// MulAdd performs a fused (x * y) + z. +// +// Asm: VFMADD213PS, CPU Feature: AVX512F +func (x Float32x16) MulAdd(y Float32x16, z Float32x16) Float32x16 + +// MulAdd performs a fused (x * y) + z. // -// Asm: VPMULUDQ, CPU Feature: AVX512F -func (x Uint64x8) MulEvenWiden(y Uint64x8) Uint64x8 +// Asm: VFMADD213PD, CPU Feature: AVX512F +func (x Float64x2) MulAdd(y Float64x2, z Float64x2) Float64x2 -/* MulEvenWidenMasked */ +// MulAdd performs a fused (x * y) + z. +// +// Asm: VFMADD213PD, CPU Feature: AVX512F +func (x Float64x4) MulAdd(y Float64x4, z Float64x4) Float64x4 -// MulEvenWidenMasked multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// MulAdd performs a fused (x * y) + z. +// +// Asm: VFMADD213PD, CPU Feature: AVX512F +func (x Float64x8) MulAdd(y Float64x8, z Float64x8) Float64x8 + +/* MulAddMasked */ + +// MulAddMasked performs a fused (x * y) + z. // // This operation is applied selectively under a write mask. // -// Asm: VPMULDQ, CPU Feature: AVX512F -func (x Int64x2) MulEvenWidenMasked(y Int64x2, mask Mask64x2) Int64x2 +// Asm: VFMADD213PS, CPU Feature: AVX512F +func (x Float32x4) MulAddMasked(y Float32x4, z Float32x4, mask Mask32x4) Float32x4 -// MulEvenWidenMasked multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// MulAddMasked performs a fused (x * y) + z. // // This operation is applied selectively under a write mask. // -// Asm: VPMULDQ, CPU Feature: AVX512F -func (x Int64x4) MulEvenWidenMasked(y Int64x4, mask Mask64x4) Int64x4 +// Asm: VFMADD213PS, CPU Feature: AVX512F +func (x Float32x8) MulAddMasked(y Float32x8, z Float32x8, mask Mask32x8) Float32x8 -// MulEvenWidenMasked multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// MulAddMasked performs a fused (x * y) + z. // // This operation is applied selectively under a write mask. // -// Asm: VPMULDQ, CPU Feature: AVX512F -func (x Int64x8) MulEvenWidenMasked(y Int64x8, mask Mask64x8) Int64x8 +// Asm: VFMADD213PS, CPU Feature: AVX512F +func (x Float32x16) MulAddMasked(y Float32x16, z Float32x16, mask Mask32x16) Float32x16 -// MulEvenWidenMasked multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. 
+// MulAddMasked performs a fused (x * y) + z. // // This operation is applied selectively under a write mask. // -// Asm: VPMULUDQ, CPU Feature: AVX512F -func (x Uint64x2) MulEvenWidenMasked(y Uint64x2, mask Mask64x2) Uint64x2 +// Asm: VFMADD213PD, CPU Feature: AVX512F +func (x Float64x2) MulAddMasked(y Float64x2, z Float64x2, mask Mask64x2) Float64x2 -// MulEvenWidenMasked multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// MulAddMasked performs a fused (x * y) + z. // // This operation is applied selectively under a write mask. // -// Asm: VPMULUDQ, CPU Feature: AVX512F -func (x Uint64x4) MulEvenWidenMasked(y Uint64x4, mask Mask64x4) Uint64x4 +// Asm: VFMADD213PD, CPU Feature: AVX512F +func (x Float64x4) MulAddMasked(y Float64x4, z Float64x4, mask Mask64x4) Float64x4 -// MulEvenWidenMasked multiplies even-indexed elements, widening the result. -// Result[i] = v1.Even[i] * v2.Even[i]. +// MulAddMasked performs a fused (x * y) + z. // // This operation is applied selectively under a write mask. // -// Asm: VPMULUDQ, CPU Feature: AVX512F -func (x Uint64x8) MulEvenWidenMasked(y Uint64x8, mask Mask64x8) Uint64x8 +// Asm: VFMADD213PD, CPU Feature: AVX512F +func (x Float64x8) MulAddMasked(y Float64x8, z Float64x8, mask Mask64x8) Float64x8 -/* MulHigh */ +/* MulAddSub */ -// MulHigh multiplies elements and stores the high part of the result. +// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // -// Asm: VPMULHW, CPU Feature: AVX -func (x Int16x8) MulHigh(y Int16x8) Int16x8 +// Asm: VFMADDSUB213PS, CPU Feature: AVX512F +func (x Float32x4) MulAddSub(y Float32x4, z Float32x4) Float32x4 -// MulHigh multiplies elements and stores the high part of the result. +// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // -// Asm: VPMULHW, CPU Feature: AVX2 -func (x Int16x16) MulHigh(y Int16x16) Int16x16 +// Asm: VFMADDSUB213PS, CPU Feature: AVX512F +func (x Float32x8) MulAddSub(y Float32x8, z Float32x8) Float32x8 -// MulHigh multiplies elements and stores the high part of the result. +// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // -// Asm: VPMULHW, CPU Feature: AVX512BW -func (x Int16x32) MulHigh(y Int16x32) Int16x32 +// Asm: VFMADDSUB213PS, CPU Feature: AVX512F +func (x Float32x16) MulAddSub(y Float32x16, z Float32x16) Float32x16 -// MulHigh multiplies elements and stores the high part of the result. +// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // -// Asm: VPMULHUW, CPU Feature: AVX -func (x Uint16x8) MulHigh(y Uint16x8) Uint16x8 +// Asm: VFMADDSUB213PD, CPU Feature: AVX512F +func (x Float64x2) MulAddSub(y Float64x2, z Float64x2) Float64x2 -// MulHigh multiplies elements and stores the high part of the result. +// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // -// Asm: VPMULHUW, CPU Feature: AVX2 -func (x Uint16x16) MulHigh(y Uint16x16) Uint16x16 +// Asm: VFMADDSUB213PD, CPU Feature: AVX512F +func (x Float64x4) MulAddSub(y Float64x4, z Float64x4) Float64x4 -// MulHigh multiplies elements and stores the high part of the result. +// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. 
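+// Equivalently, per pair of lanes:
+//
+//	result[2*i]   = x[2*i]*y[2*i] + z[2*i]
+//	result[2*i+1] = x[2*i+1]*y[2*i+1] - z[2*i+1]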
// -// Asm: VPMULHUW, CPU Feature: AVX512BW -func (x Uint16x32) MulHigh(y Uint16x32) Uint16x32 +// Asm: VFMADDSUB213PD, CPU Feature: AVX512F +func (x Float64x8) MulAddSub(y Float64x8, z Float64x8) Float64x8 -/* MulHighMasked */ +/* MulAddSubMasked */ -// MulHighMasked multiplies elements and stores the high part of the result. +// MulAddSubMasked performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // // This operation is applied selectively under a write mask. // -// Asm: VPMULHW, CPU Feature: AVX512BW -func (x Int16x8) MulHighMasked(y Int16x8, mask Mask16x8) Int16x8 +// Asm: VFMADDSUB213PS, CPU Feature: AVX512F +func (x Float32x4) MulAddSubMasked(y Float32x4, z Float32x4, mask Mask32x4) Float32x4 -// MulHighMasked multiplies elements and stores the high part of the result. +// MulAddSubMasked performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // // This operation is applied selectively under a write mask. // -// Asm: VPMULHW, CPU Feature: AVX512BW -func (x Int16x16) MulHighMasked(y Int16x16, mask Mask16x16) Int16x16 +// Asm: VFMADDSUB213PS, CPU Feature: AVX512F +func (x Float32x8) MulAddSubMasked(y Float32x8, z Float32x8, mask Mask32x8) Float32x8 -// MulHighMasked multiplies elements and stores the high part of the result. +// MulAddSubMasked performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // // This operation is applied selectively under a write mask. // -// Asm: VPMULHW, CPU Feature: AVX512BW -func (x Int16x32) MulHighMasked(y Int16x32, mask Mask16x32) Int16x32 +// Asm: VFMADDSUB213PS, CPU Feature: AVX512F +func (x Float32x16) MulAddSubMasked(y Float32x16, z Float32x16, mask Mask32x16) Float32x16 + +// MulAddSubMasked performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. +// +// This operation is applied selectively under a write mask. +// +// Asm: VFMADDSUB213PD, CPU Feature: AVX512F +func (x Float64x2) MulAddSubMasked(y Float64x2, z Float64x2, mask Mask64x2) Float64x2 + +// MulAddSubMasked performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. +// +// This operation is applied selectively under a write mask. +// +// Asm: VFMADDSUB213PD, CPU Feature: AVX512F +func (x Float64x4) MulAddSubMasked(y Float64x4, z Float64x4, mask Mask64x4) Float64x4 + +// MulAddSubMasked performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. +// +// This operation is applied selectively under a write mask. +// +// Asm: VFMADDSUB213PD, CPU Feature: AVX512F +func (x Float64x8) MulAddSubMasked(y Float64x8, z Float64x8, mask Mask64x8) Float64x8 + +/* MulEvenWiden */ + +// MulEvenWiden multiplies even-indexed elements, widening the result. +// Result[i] = v1.Even[i] * v2.Even[i]. +// +// Asm: VPMULDQ, CPU Feature: AVX +func (x Int32x4) MulEvenWiden(y Int32x4) Int64x2 + +// MulEvenWiden multiplies even-indexed elements, widening the result. +// Result[i] = v1.Even[i] * v2.Even[i]. +// +// Asm: VPMULDQ, CPU Feature: AVX2 +func (x Int32x8) MulEvenWiden(y Int32x8) Int64x4 + +// MulEvenWiden multiplies even-indexed elements, widening the result. +// Result[i] = v1.Even[i] * v2.Even[i]. +// +// Asm: VPMULUDQ, CPU Feature: AVX +func (x Uint32x4) MulEvenWiden(y Uint32x4) Uint64x2 + +// MulEvenWiden multiplies even-indexed elements, widening the result. +// Result[i] = v1.Even[i] * v2.Even[i]. 
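+// For example (illustrative values), for Uint32x8 inputs
+//
+//	x: {2, 9, 3, 9, 5, 9, 7, 9}
+//	y: {10, 9, 100, 9, 1000, 9, 10000, 9}
+//
+// the Uint64x4 result is {20, 300, 5000, 70000}.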
+// +// Asm: VPMULUDQ, CPU Feature: AVX2 +func (x Uint32x8) MulEvenWiden(y Uint32x8) Uint64x4 + +/* MulHigh */ + +// MulHigh multiplies elements and stores the high part of the result. +// +// Asm: VPMULHUW, CPU Feature: AVX +func (x Int16x8) MulHigh(y Int16x8) Int16x8 + +// MulHigh multiplies elements and stores the high part of the result. +// +// Asm: VPMULHUW, CPU Feature: AVX2 +func (x Int16x16) MulHigh(y Int16x16) Int16x16 + +// MulHigh multiplies elements and stores the high part of the result. +// +// Asm: VPMULHW, CPU Feature: AVX512BW +func (x Int16x32) MulHigh(y Int16x32) Int16x32 + +/* MulHighMasked */ // MulHighMasked multiplies elements and stores the high part of the result. // // This operation is applied selectively under a write mask. // // Asm: VPMULHUW, CPU Feature: AVX512BW -func (x Uint16x8) MulHighMasked(y Uint16x8, mask Mask16x8) Uint16x8 +func (x Int16x8) MulHighMasked(y Int16x8, mask Mask16x8) Int16x8 // MulHighMasked multiplies elements and stores the high part of the result. // // This operation is applied selectively under a write mask. // -// Asm: VPMULHUW, CPU Feature: AVX512BW -func (x Uint16x16) MulHighMasked(y Uint16x16, mask Mask16x16) Uint16x16 +// Asm: VPMULHW, CPU Feature: AVX512BW +func (x Int16x16) MulHighMasked(y Int16x16, mask Mask16x16) Int16x16 // MulHighMasked multiplies elements and stores the high part of the result. // // This operation is applied selectively under a write mask. // // Asm: VPMULHUW, CPU Feature: AVX512BW -func (x Uint16x32) MulHighMasked(y Uint16x32, mask Mask16x32) Uint16x32 +func (x Int16x32) MulHighMasked(y Int16x32, mask Mask16x32) Int16x32 /* MulMasked */ @@ -6117,6 +6005,145 @@ func (x Int64x4) MulMasked(y Int64x4, mask Mask64x4) Int64x4 // Asm: VPMULLQ, CPU Feature: AVX512DQ func (x Int64x8) MulMasked(y Int64x8, mask Mask64x8) Int64x8 +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLW, CPU Feature: AVX512BW +func (x Uint16x8) MulMasked(y Uint16x8, mask Mask16x8) Uint16x8 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLW, CPU Feature: AVX512BW +func (x Uint16x16) MulMasked(y Uint16x16, mask Mask16x16) Uint16x16 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLW, CPU Feature: AVX512BW +func (x Uint16x32) MulMasked(y Uint16x32, mask Mask16x32) Uint16x32 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLD, CPU Feature: AVX512F +func (x Uint32x4) MulMasked(y Uint32x4, mask Mask32x4) Uint32x4 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLD, CPU Feature: AVX512F +func (x Uint32x8) MulMasked(y Uint32x8, mask Mask32x8) Uint32x8 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLD, CPU Feature: AVX512F +func (x Uint32x16) MulMasked(y Uint32x16, mask Mask32x16) Uint32x16 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. 
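+// In other words, for each lane i selected by the mask,
+//
+//	result[i] = x[i] * y[i]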
+// +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Uint64x2) MulMasked(y Uint64x2, mask Mask64x2) Uint64x2 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Uint64x4) MulMasked(y Uint64x4, mask Mask64x4) Uint64x4 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Uint64x8) MulMasked(y Uint64x8, mask Mask64x8) Uint64x8 + +/* MulSubAdd */ + +// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. +// +// Asm: VFMSUBADD213PS, CPU Feature: AVX512F +func (x Float32x4) MulSubAdd(y Float32x4, z Float32x4) Float32x4 + +// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. +// +// Asm: VFMSUBADD213PS, CPU Feature: AVX512F +func (x Float32x8) MulSubAdd(y Float32x8, z Float32x8) Float32x8 + +// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. +// +// Asm: VFMSUBADD213PS, CPU Feature: AVX512F +func (x Float32x16) MulSubAdd(y Float32x16, z Float32x16) Float32x16 + +// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. +// +// Asm: VFMSUBADD213PD, CPU Feature: AVX512F +func (x Float64x2) MulSubAdd(y Float64x2, z Float64x2) Float64x2 + +// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. +// +// Asm: VFMSUBADD213PD, CPU Feature: AVX512F +func (x Float64x4) MulSubAdd(y Float64x4, z Float64x4) Float64x4 + +// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. +// +// Asm: VFMSUBADD213PD, CPU Feature: AVX512F +func (x Float64x8) MulSubAdd(y Float64x8, z Float64x8) Float64x8 + +/* MulSubAddMasked */ + +// MulSubAddMasked performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. +// +// This operation is applied selectively under a write mask. +// +// Asm: VFMSUBADD213PS, CPU Feature: AVX512F +func (x Float32x4) MulSubAddMasked(y Float32x4, z Float32x4, mask Mask32x4) Float32x4 + +// MulSubAddMasked performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. +// +// This operation is applied selectively under a write mask. +// +// Asm: VFMSUBADD213PS, CPU Feature: AVX512F +func (x Float32x8) MulSubAddMasked(y Float32x8, z Float32x8, mask Mask32x8) Float32x8 + +// MulSubAddMasked performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. +// +// This operation is applied selectively under a write mask. +// +// Asm: VFMSUBADD213PS, CPU Feature: AVX512F +func (x Float32x16) MulSubAddMasked(y Float32x16, z Float32x16, mask Mask32x16) Float32x16 + +// MulSubAddMasked performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. +// +// This operation is applied selectively under a write mask. +// +// Asm: VFMSUBADD213PD, CPU Feature: AVX512F +func (x Float64x2) MulSubAddMasked(y Float64x2, z Float64x2, mask Mask64x2) Float64x2 + +// MulSubAddMasked performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. +// +// This operation is applied selectively under a write mask. 
+// +// Asm: VFMSUBADD213PD, CPU Feature: AVX512F +func (x Float64x4) MulSubAddMasked(y Float64x4, z Float64x4, mask Mask64x4) Float64x4 + +// MulSubAddMasked performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. +// +// This operation is applied selectively under a write mask. +// +// Asm: VFMSUBADD213PD, CPU Feature: AVX512F +func (x Float64x8) MulSubAddMasked(y Float64x8, z Float64x8, mask Mask64x8) Float64x8 + /* NotEqual */ // NotEqual compares for inequality. @@ -6324,162 +6351,454 @@ func (x Int8x16) NotEqualMasked(y Int8x16, mask Mask8x16) Mask8x16 // // This operation is applied selectively under a write mask. // -// Asm: VPCMPB, CPU Feature: AVX512BW -func (x Int8x32) NotEqualMasked(y Int8x32, mask Mask8x32) Mask8x32 +// Asm: VPCMPB, CPU Feature: AVX512BW +func (x Int8x32) NotEqualMasked(y Int8x32, mask Mask8x32) Mask8x32 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPB, CPU Feature: AVX512BW +func (x Int8x64) NotEqualMasked(y Int8x64, mask Mask8x64) Mask8x64 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPW, CPU Feature: AVX512BW +func (x Int16x8) NotEqualMasked(y Int16x8, mask Mask16x8) Mask16x8 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPW, CPU Feature: AVX512BW +func (x Int16x16) NotEqualMasked(y Int16x16, mask Mask16x16) Mask16x16 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPW, CPU Feature: AVX512BW +func (x Int16x32) NotEqualMasked(y Int16x32, mask Mask16x32) Mask16x32 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPD, CPU Feature: AVX512F +func (x Int32x4) NotEqualMasked(y Int32x4, mask Mask32x4) Mask32x4 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPD, CPU Feature: AVX512F +func (x Int32x8) NotEqualMasked(y Int32x8, mask Mask32x8) Mask32x8 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPD, CPU Feature: AVX512F +func (x Int32x16) NotEqualMasked(y Int32x16, mask Mask32x16) Mask32x16 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPQ, CPU Feature: AVX512F +func (x Int64x2) NotEqualMasked(y Int64x2, mask Mask64x2) Mask64x2 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPQ, CPU Feature: AVX512F +func (x Int64x4) NotEqualMasked(y Int64x4, mask Mask64x4) Mask64x4 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPQ, CPU Feature: AVX512F +func (x Int64x8) NotEqualMasked(y Int64x8, mask Mask64x8) Mask64x8 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPUB, CPU Feature: AVX512BW +func (x Uint8x16) NotEqualMasked(y Uint8x16, mask Mask8x16) Mask8x16 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. 
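+// For each lane i selected by the mask, element i of the resulting mask
+// reports whether x[i] != y[i].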
+// +// Asm: VPCMPUB, CPU Feature: AVX512BW +func (x Uint8x32) NotEqualMasked(y Uint8x32, mask Mask8x32) Mask8x32 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPUB, CPU Feature: AVX512BW +func (x Uint8x64) NotEqualMasked(y Uint8x64, mask Mask8x64) Mask8x64 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPUW, CPU Feature: AVX512BW +func (x Uint16x8) NotEqualMasked(y Uint16x8, mask Mask16x8) Mask16x8 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPUW, CPU Feature: AVX512BW +func (x Uint16x16) NotEqualMasked(y Uint16x16, mask Mask16x16) Mask16x16 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPUW, CPU Feature: AVX512BW +func (x Uint16x32) NotEqualMasked(y Uint16x32, mask Mask16x32) Mask16x32 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPUD, CPU Feature: AVX512F +func (x Uint32x4) NotEqualMasked(y Uint32x4, mask Mask32x4) Mask32x4 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPUD, CPU Feature: AVX512F +func (x Uint32x8) NotEqualMasked(y Uint32x8, mask Mask32x8) Mask32x8 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPUD, CPU Feature: AVX512F +func (x Uint32x16) NotEqualMasked(y Uint32x16, mask Mask32x16) Mask32x16 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPUQ, CPU Feature: AVX512F +func (x Uint64x2) NotEqualMasked(y Uint64x2, mask Mask64x2) Mask64x2 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPUQ, CPU Feature: AVX512F +func (x Uint64x4) NotEqualMasked(y Uint64x4, mask Mask64x4) Mask64x4 + +// NotEqualMasked compares for inequality. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPCMPUQ, CPU Feature: AVX512F +func (x Uint64x8) NotEqualMasked(y Uint64x8, mask Mask64x8) Mask64x8 + +/* OnesCount */ + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTB, CPU Feature: AVX512BITALG +func (x Int8x16) OnesCount() Int8x16 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTB, CPU Feature: AVX512BITALG +func (x Int8x32) OnesCount() Int8x32 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTB, CPU Feature: AVX512BITALG +func (x Int8x64) OnesCount() Int8x64 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTW, CPU Feature: AVX512BITALG +func (x Int16x8) OnesCount() Int16x8 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTW, CPU Feature: AVX512BITALG +func (x Int16x16) OnesCount() Int16x16 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTW, CPU Feature: AVX512BITALG +func (x Int16x32) OnesCount() Int16x32 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ +func (x Int32x4) OnesCount() Int32x4 + +// OnesCount counts the number of set bits in each element. 
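+// For example, an element whose bit pattern is 0b0000_1011 (11) yields 3.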
+// +// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ +func (x Int32x8) OnesCount() Int32x8 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ +func (x Int32x16) OnesCount() Int32x16 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ +func (x Int64x2) OnesCount() Int64x2 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ +func (x Int64x4) OnesCount() Int64x4 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ +func (x Int64x8) OnesCount() Int64x8 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTB, CPU Feature: AVX512BITALG +func (x Uint8x16) OnesCount() Uint8x16 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTB, CPU Feature: AVX512BITALG +func (x Uint8x32) OnesCount() Uint8x32 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTB, CPU Feature: AVX512BITALG +func (x Uint8x64) OnesCount() Uint8x64 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTW, CPU Feature: AVX512BITALG +func (x Uint16x8) OnesCount() Uint16x8 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTW, CPU Feature: AVX512BITALG +func (x Uint16x16) OnesCount() Uint16x16 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTW, CPU Feature: AVX512BITALG +func (x Uint16x32) OnesCount() Uint16x32 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ +func (x Uint32x4) OnesCount() Uint32x4 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ +func (x Uint32x8) OnesCount() Uint32x8 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ +func (x Uint32x16) OnesCount() Uint32x16 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ +func (x Uint64x2) OnesCount() Uint64x2 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ +func (x Uint64x4) OnesCount() Uint64x4 + +// OnesCount counts the number of set bits in each element. +// +// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ +func (x Uint64x8) OnesCount() Uint64x8 + +/* OnesCountMasked */ + +// OnesCountMasked counts the number of set bits in each element. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPOPCNTB, CPU Feature: AVX512BITALG +func (x Int8x16) OnesCountMasked(mask Mask8x16) Int8x16 + +// OnesCountMasked counts the number of set bits in each element. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPOPCNTB, CPU Feature: AVX512BITALG +func (x Int8x32) OnesCountMasked(mask Mask8x32) Int8x32 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPB, CPU Feature: AVX512BW -func (x Int8x64) NotEqualMasked(y Int8x64, mask Mask8x64) Mask8x64 +// Asm: VPOPCNTB, CPU Feature: AVX512BITALG +func (x Int8x64) OnesCountMasked(mask Mask8x64) Int8x64 -// NotEqualMasked compares for inequality. 
+// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPW, CPU Feature: AVX512BW -func (x Int16x8) NotEqualMasked(y Int16x8, mask Mask16x8) Mask16x8 +// Asm: VPOPCNTW, CPU Feature: AVX512BITALG +func (x Int16x8) OnesCountMasked(mask Mask16x8) Int16x8 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPW, CPU Feature: AVX512BW -func (x Int16x16) NotEqualMasked(y Int16x16, mask Mask16x16) Mask16x16 +// Asm: VPOPCNTW, CPU Feature: AVX512BITALG +func (x Int16x16) OnesCountMasked(mask Mask16x16) Int16x16 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPW, CPU Feature: AVX512BW -func (x Int16x32) NotEqualMasked(y Int16x32, mask Mask16x32) Mask16x32 +// Asm: VPOPCNTW, CPU Feature: AVX512BITALG +func (x Int16x32) OnesCountMasked(mask Mask16x32) Int16x32 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPD, CPU Feature: AVX512F -func (x Int32x4) NotEqualMasked(y Int32x4, mask Mask32x4) Mask32x4 +// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ +func (x Int32x4) OnesCountMasked(mask Mask32x4) Int32x4 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPD, CPU Feature: AVX512F -func (x Int32x8) NotEqualMasked(y Int32x8, mask Mask32x8) Mask32x8 +// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ +func (x Int32x8) OnesCountMasked(mask Mask32x8) Int32x8 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPD, CPU Feature: AVX512F -func (x Int32x16) NotEqualMasked(y Int32x16, mask Mask32x16) Mask32x16 +// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ +func (x Int32x16) OnesCountMasked(mask Mask32x16) Int32x16 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPQ, CPU Feature: AVX512F -func (x Int64x2) NotEqualMasked(y Int64x2, mask Mask64x2) Mask64x2 +// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ +func (x Int64x2) OnesCountMasked(mask Mask64x2) Int64x2 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPQ, CPU Feature: AVX512F -func (x Int64x4) NotEqualMasked(y Int64x4, mask Mask64x4) Mask64x4 +// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ +func (x Int64x4) OnesCountMasked(mask Mask64x4) Int64x4 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPQ, CPU Feature: AVX512F -func (x Int64x8) NotEqualMasked(y Int64x8, mask Mask64x8) Mask64x8 +// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ +func (x Int64x8) OnesCountMasked(mask Mask64x8) Int64x8 -// NotEqualMasked compares for inequality. 
+// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPUB, CPU Feature: AVX512BW -func (x Uint8x16) NotEqualMasked(y Uint8x16, mask Mask8x16) Mask8x16 +// Asm: VPOPCNTB, CPU Feature: AVX512BITALG +func (x Uint8x16) OnesCountMasked(mask Mask8x16) Uint8x16 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPUB, CPU Feature: AVX512BW -func (x Uint8x32) NotEqualMasked(y Uint8x32, mask Mask8x32) Mask8x32 +// Asm: VPOPCNTB, CPU Feature: AVX512BITALG +func (x Uint8x32) OnesCountMasked(mask Mask8x32) Uint8x32 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPUB, CPU Feature: AVX512BW -func (x Uint8x64) NotEqualMasked(y Uint8x64, mask Mask8x64) Mask8x64 +// Asm: VPOPCNTB, CPU Feature: AVX512BITALG +func (x Uint8x64) OnesCountMasked(mask Mask8x64) Uint8x64 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPUW, CPU Feature: AVX512BW -func (x Uint16x8) NotEqualMasked(y Uint16x8, mask Mask16x8) Mask16x8 +// Asm: VPOPCNTW, CPU Feature: AVX512BITALG +func (x Uint16x8) OnesCountMasked(mask Mask16x8) Uint16x8 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPUW, CPU Feature: AVX512BW -func (x Uint16x16) NotEqualMasked(y Uint16x16, mask Mask16x16) Mask16x16 +// Asm: VPOPCNTW, CPU Feature: AVX512BITALG +func (x Uint16x16) OnesCountMasked(mask Mask16x16) Uint16x16 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPUW, CPU Feature: AVX512BW -func (x Uint16x32) NotEqualMasked(y Uint16x32, mask Mask16x32) Mask16x32 +// Asm: VPOPCNTW, CPU Feature: AVX512BITALG +func (x Uint16x32) OnesCountMasked(mask Mask16x32) Uint16x32 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPUD, CPU Feature: AVX512F -func (x Uint32x4) NotEqualMasked(y Uint32x4, mask Mask32x4) Mask32x4 +// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ +func (x Uint32x4) OnesCountMasked(mask Mask32x4) Uint32x4 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPUD, CPU Feature: AVX512F -func (x Uint32x8) NotEqualMasked(y Uint32x8, mask Mask32x8) Mask32x8 +// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ +func (x Uint32x8) OnesCountMasked(mask Mask32x8) Uint32x8 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPUD, CPU Feature: AVX512F -func (x Uint32x16) NotEqualMasked(y Uint32x16, mask Mask32x16) Mask32x16 +// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ +func (x Uint32x16) OnesCountMasked(mask Mask32x16) Uint32x16 -// NotEqualMasked compares for inequality. 
+// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPUQ, CPU Feature: AVX512F -func (x Uint64x2) NotEqualMasked(y Uint64x2, mask Mask64x2) Mask64x2 +// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ +func (x Uint64x2) OnesCountMasked(mask Mask64x2) Uint64x2 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPUQ, CPU Feature: AVX512F -func (x Uint64x4) NotEqualMasked(y Uint64x4, mask Mask64x4) Mask64x4 +// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ +func (x Uint64x4) OnesCountMasked(mask Mask64x4) Uint64x4 -// NotEqualMasked compares for inequality. +// OnesCountMasked counts the number of set bits in each element. // // This operation is applied selectively under a write mask. // -// Asm: VPCMPUQ, CPU Feature: AVX512F -func (x Uint64x8) NotEqualMasked(y Uint64x8, mask Mask64x8) Mask64x8 +// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ +func (x Uint64x8) OnesCountMasked(mask Mask64x8) Uint64x8 /* Or */ @@ -6689,52 +7008,6 @@ func (x Uint64x4) OrMasked(y Uint64x4, mask Mask64x4) Uint64x4 // Asm: VPORQ, CPU Feature: AVX512F func (x Uint64x8) OrMasked(y Uint64x8, mask Mask64x8) Uint64x8 -/* PairDotProd */ - -// PairDotProd multiplies the elements and add the pairs together, -// yielding a vector of half as many elements with twice the input element size. -// -// Asm: VPMADDWD, CPU Feature: AVX -func (x Int16x8) PairDotProd(y Int16x8) Int32x4 - -// PairDotProd multiplies the elements and add the pairs together, -// yielding a vector of half as many elements with twice the input element size. -// -// Asm: VPMADDWD, CPU Feature: AVX2 -func (x Int16x16) PairDotProd(y Int16x16) Int32x8 - -// PairDotProd multiplies the elements and add the pairs together, -// yielding a vector of half as many elements with twice the input element size. -// -// Asm: VPMADDWD, CPU Feature: AVX512BW -func (x Int16x32) PairDotProd(y Int16x32) Int32x16 - -/* PairDotProdMasked */ - -// PairDotProdMasked multiplies the elements and add the pairs together, -// yielding a vector of half as many elements with twice the input element size. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMADDWD, CPU Feature: AVX512BW -func (x Int16x8) PairDotProdMasked(y Int16x8, mask Mask16x8) Int32x4 - -// PairDotProdMasked multiplies the elements and add the pairs together, -// yielding a vector of half as many elements with twice the input element size. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMADDWD, CPU Feature: AVX512BW -func (x Int16x16) PairDotProdMasked(y Int16x16, mask Mask16x16) Int32x8 - -// PairDotProdMasked multiplies the elements and add the pairs together, -// yielding a vector of half as many elements with twice the input element size. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMADDWD, CPU Feature: AVX512BW -func (x Int16x32) PairDotProdMasked(y Int16x32, mask Mask16x32) Int32x16 - /* Permute */ // Permute performs a full permutation of vector x using indices: @@ -7599,365 +7872,225 @@ func (x Float32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Float32x16 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // Only the needed bits to represent x's index are used in indices' elements. // -// This operation is applied selectively under a write mask. 
-// -// Asm: VPERMD, CPU Feature: AVX512F -func (x Int32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Int32x16 - -// PermuteMasked performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPERMD, CPU Feature: AVX512F -func (x Uint32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Uint32x16 - -// PermuteMasked performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPERMPD, CPU Feature: AVX512F -func (x Float64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Float64x4 - -// PermuteMasked performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPERMQ, CPU Feature: AVX512F -func (x Int64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Int64x4 - -// PermuteMasked performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPERMQ, CPU Feature: AVX512F -func (x Uint64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Uint64x4 - -// PermuteMasked performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPERMPD, CPU Feature: AVX512F -func (x Float64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Float64x8 - -// PermuteMasked performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPERMQ, CPU Feature: AVX512F -func (x Int64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Int64x8 - -// PermuteMasked performs a full permutation of vector x using indices: -// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPERMQ, CPU Feature: AVX512F -func (x Uint64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Uint64x8 - -/* PopCount */ - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTB, CPU Feature: AVX512BITALG -func (x Int8x16) PopCount() Int8x16 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTB, CPU Feature: AVX512BITALG -func (x Int8x32) PopCount() Int8x32 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTB, CPU Feature: AVX512BITALG -func (x Int8x64) PopCount() Int8x64 - -// PopCount counts the number of set bits in each element. 
-// -// Asm: VPOPCNTW, CPU Feature: AVX512BITALG -func (x Int16x8) PopCount() Int16x8 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTW, CPU Feature: AVX512BITALG -func (x Int16x16) PopCount() Int16x16 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTW, CPU Feature: AVX512BITALG -func (x Int16x32) PopCount() Int16x32 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ -func (x Int32x4) PopCount() Int32x4 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ -func (x Int32x8) PopCount() Int32x8 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ -func (x Int32x16) PopCount() Int32x16 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ -func (x Int64x2) PopCount() Int64x2 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ -func (x Int64x4) PopCount() Int64x4 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ -func (x Int64x8) PopCount() Int64x8 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTB, CPU Feature: AVX512BITALG -func (x Uint8x16) PopCount() Uint8x16 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTB, CPU Feature: AVX512BITALG -func (x Uint8x32) PopCount() Uint8x32 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTB, CPU Feature: AVX512BITALG -func (x Uint8x64) PopCount() Uint8x64 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTW, CPU Feature: AVX512BITALG -func (x Uint16x8) PopCount() Uint16x8 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTW, CPU Feature: AVX512BITALG -func (x Uint16x16) PopCount() Uint16x16 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTW, CPU Feature: AVX512BITALG -func (x Uint16x32) PopCount() Uint16x32 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ -func (x Uint32x4) PopCount() Uint32x4 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ -func (x Uint32x8) PopCount() Uint32x8 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ -func (x Uint32x16) PopCount() Uint32x16 - -// PopCount counts the number of set bits in each element. -// -// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ -func (x Uint64x2) PopCount() Uint64x2 - -// PopCount counts the number of set bits in each element. +// This operation is applied selectively under a write mask. // -// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ -func (x Uint64x4) PopCount() Uint64x4 +// Asm: VPERMD, CPU Feature: AVX512F +func (x Int32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Int32x16 -// PopCount counts the number of set bits in each element. +// PermuteMasked performs a full permutation of vector x using indices: +// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// Only the needed bits to represent x's index are used in indices' elements. 
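+// For example (an illustrative 4-element sketch), x = {a, b, c, d} with
+// indices = {2, 2, 0, 3} yields {c, c, a, d} in the lanes selected by the mask.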
// -// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ -func (x Uint64x8) PopCount() Uint64x8 - -/* PopCountMasked */ +// This operation is applied selectively under a write mask. +// +// Asm: VPERMD, CPU Feature: AVX512F +func (x Uint32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Uint32x16 -// PopCountMasked counts the number of set bits in each element. +// PermuteMasked performs a full permutation of vector x using indices: +// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// Only the needed bits to represent x's index are used in indices' elements. // // This operation is applied selectively under a write mask. // -// Asm: VPOPCNTB, CPU Feature: AVX512BITALG -func (x Int8x16) PopCountMasked(mask Mask8x16) Int8x16 +// Asm: VPERMPD, CPU Feature: AVX512F +func (x Float64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Float64x4 -// PopCountMasked counts the number of set bits in each element. +// PermuteMasked performs a full permutation of vector x using indices: +// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// Only the needed bits to represent x's index are used in indices' elements. // // This operation is applied selectively under a write mask. // -// Asm: VPOPCNTB, CPU Feature: AVX512BITALG -func (x Int8x32) PopCountMasked(mask Mask8x32) Int8x32 +// Asm: VPERMQ, CPU Feature: AVX512F +func (x Int64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Int64x4 -// PopCountMasked counts the number of set bits in each element. +// PermuteMasked performs a full permutation of vector x using indices: +// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// Only the needed bits to represent x's index are used in indices' elements. // // This operation is applied selectively under a write mask. // -// Asm: VPOPCNTB, CPU Feature: AVX512BITALG -func (x Int8x64) PopCountMasked(mask Mask8x64) Int8x64 +// Asm: VPERMQ, CPU Feature: AVX512F +func (x Uint64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Uint64x4 -// PopCountMasked counts the number of set bits in each element. +// PermuteMasked performs a full permutation of vector x using indices: +// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// Only the needed bits to represent x's index are used in indices' elements. // // This operation is applied selectively under a write mask. // -// Asm: VPOPCNTW, CPU Feature: AVX512BITALG -func (x Int16x8) PopCountMasked(mask Mask16x8) Int16x8 +// Asm: VPERMPD, CPU Feature: AVX512F +func (x Float64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Float64x8 -// PopCountMasked counts the number of set bits in each element. +// PermuteMasked performs a full permutation of vector x using indices: +// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// Only the needed bits to represent x's index are used in indices' elements. // // This operation is applied selectively under a write mask. // -// Asm: VPOPCNTW, CPU Feature: AVX512BITALG -func (x Int16x16) PopCountMasked(mask Mask16x16) Int16x16 +// Asm: VPERMQ, CPU Feature: AVX512F +func (x Int64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Int64x8 -// PopCountMasked counts the number of set bits in each element. +// PermuteMasked performs a full permutation of vector x using indices: +// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// Only the needed bits to represent x's index are used in indices' elements. // // This operation is applied selectively under a write mask. 
// -// Asm: VPOPCNTW, CPU Feature: AVX512BITALG -func (x Int16x32) PopCountMasked(mask Mask16x32) Int16x32 +// Asm: VPERMQ, CPU Feature: AVX512F +func (x Uint64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Uint64x8 + +/* Reciprocal */ -// PopCountMasked counts the number of set bits in each element. +// Reciprocal computes an approximate reciprocal of each element. // -// This operation is applied selectively under a write mask. +// Asm: VRCPPS, CPU Feature: AVX +func (x Float32x4) Reciprocal() Float32x4 + +// Reciprocal computes an approximate reciprocal of each element. // -// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ -func (x Int32x4) PopCountMasked(mask Mask32x4) Int32x4 +// Asm: VRCPPS, CPU Feature: AVX +func (x Float32x8) Reciprocal() Float32x8 -// PopCountMasked counts the number of set bits in each element. +// Reciprocal computes an approximate reciprocal of each element. // -// This operation is applied selectively under a write mask. +// Asm: VRCP14PS, CPU Feature: AVX512F +func (x Float32x16) Reciprocal() Float32x16 + +// Reciprocal computes an approximate reciprocal of each element. // -// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ -func (x Int32x8) PopCountMasked(mask Mask32x8) Int32x8 +// Asm: VRCP14PD, CPU Feature: AVX512F +func (x Float64x2) Reciprocal() Float64x2 -// PopCountMasked counts the number of set bits in each element. +// Reciprocal computes an approximate reciprocal of each element. // -// This operation is applied selectively under a write mask. +// Asm: VRCP14PD, CPU Feature: AVX512F +func (x Float64x4) Reciprocal() Float64x4 + +// Reciprocal computes an approximate reciprocal of each element. // -// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ -func (x Int32x16) PopCountMasked(mask Mask32x16) Int32x16 +// Asm: VRCP14PD, CPU Feature: AVX512F +func (x Float64x8) Reciprocal() Float64x8 + +/* ReciprocalMasked */ -// PopCountMasked counts the number of set bits in each element. +// ReciprocalMasked computes an approximate reciprocal of each element. // // This operation is applied selectively under a write mask. // -// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ -func (x Int64x2) PopCountMasked(mask Mask64x2) Int64x2 +// Asm: VRCP14PS, CPU Feature: AVX512F +func (x Float32x4) ReciprocalMasked(mask Mask32x4) Float32x4 -// PopCountMasked counts the number of set bits in each element. +// ReciprocalMasked computes an approximate reciprocal of each element. // // This operation is applied selectively under a write mask. // -// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ -func (x Int64x4) PopCountMasked(mask Mask64x4) Int64x4 +// Asm: VRCP14PS, CPU Feature: AVX512F +func (x Float32x8) ReciprocalMasked(mask Mask32x8) Float32x8 -// PopCountMasked counts the number of set bits in each element. +// ReciprocalMasked computes an approximate reciprocal of each element. // // This operation is applied selectively under a write mask. // -// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ -func (x Int64x8) PopCountMasked(mask Mask64x8) Int64x8 +// Asm: VRCP14PS, CPU Feature: AVX512F +func (x Float32x16) ReciprocalMasked(mask Mask32x16) Float32x16 -// PopCountMasked counts the number of set bits in each element. +// ReciprocalMasked computes an approximate reciprocal of each element. // // This operation is applied selectively under a write mask. 
// -// Asm: VPOPCNTB, CPU Feature: AVX512BITALG -func (x Uint8x16) PopCountMasked(mask Mask8x16) Uint8x16 +// Asm: VRCP14PD, CPU Feature: AVX512F +func (x Float64x2) ReciprocalMasked(mask Mask64x2) Float64x2 -// PopCountMasked counts the number of set bits in each element. +// ReciprocalMasked computes an approximate reciprocal of each element. // // This operation is applied selectively under a write mask. // -// Asm: VPOPCNTB, CPU Feature: AVX512BITALG -func (x Uint8x32) PopCountMasked(mask Mask8x32) Uint8x32 +// Asm: VRCP14PD, CPU Feature: AVX512F +func (x Float64x4) ReciprocalMasked(mask Mask64x4) Float64x4 -// PopCountMasked counts the number of set bits in each element. +// ReciprocalMasked computes an approximate reciprocal of each element. // // This operation is applied selectively under a write mask. // -// Asm: VPOPCNTB, CPU Feature: AVX512BITALG -func (x Uint8x64) PopCountMasked(mask Mask8x64) Uint8x64 +// Asm: VRCP14PD, CPU Feature: AVX512F +func (x Float64x8) ReciprocalMasked(mask Mask64x8) Float64x8 -// PopCountMasked counts the number of set bits in each element. +/* ReciprocalSqrt */ + +// ReciprocalSqrt computes an approximate reciprocal of the square root of each element. // -// This operation is applied selectively under a write mask. +// Asm: VRSQRTPS, CPU Feature: AVX +func (x Float32x4) ReciprocalSqrt() Float32x4 + +// ReciprocalSqrt computes an approximate reciprocal of the square root of each element. // -// Asm: VPOPCNTW, CPU Feature: AVX512BITALG -func (x Uint16x8) PopCountMasked(mask Mask16x8) Uint16x8 +// Asm: VRSQRTPS, CPU Feature: AVX +func (x Float32x8) ReciprocalSqrt() Float32x8 -// PopCountMasked counts the number of set bits in each element. +// ReciprocalSqrt computes an approximate reciprocal of the square root of each element. // -// This operation is applied selectively under a write mask. +// Asm: VRSQRT14PS, CPU Feature: AVX512F +func (x Float32x16) ReciprocalSqrt() Float32x16 + +// ReciprocalSqrt computes an approximate reciprocal of the square root of each element. // -// Asm: VPOPCNTW, CPU Feature: AVX512BITALG -func (x Uint16x16) PopCountMasked(mask Mask16x16) Uint16x16 +// Asm: VRSQRT14PD, CPU Feature: AVX512F +func (x Float64x2) ReciprocalSqrt() Float64x2 -// PopCountMasked counts the number of set bits in each element. +// ReciprocalSqrt computes an approximate reciprocal of the square root of each element. // -// This operation is applied selectively under a write mask. +// Asm: VRSQRT14PD, CPU Feature: AVX512F +func (x Float64x4) ReciprocalSqrt() Float64x4 + +// ReciprocalSqrt computes an approximate reciprocal of the square root of each element. // -// Asm: VPOPCNTW, CPU Feature: AVX512BITALG -func (x Uint16x32) PopCountMasked(mask Mask16x32) Uint16x32 +// Asm: VRSQRT14PD, CPU Feature: AVX512F +func (x Float64x8) ReciprocalSqrt() Float64x8 -// PopCountMasked counts the number of set bits in each element. +/* ReciprocalSqrtMasked */ + +// ReciprocalSqrtMasked computes an approximate reciprocal of the square root of each element. // // This operation is applied selectively under a write mask. // -// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ -func (x Uint32x4) PopCountMasked(mask Mask32x4) Uint32x4 +// Asm: VRSQRT14PS, CPU Feature: AVX512F +func (x Float32x4) ReciprocalSqrtMasked(mask Mask32x4) Float32x4 -// PopCountMasked counts the number of set bits in each element. +// ReciprocalSqrtMasked computes an approximate reciprocal of the square root of each element. // // This operation is applied selectively under a write mask. 
// -// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ -func (x Uint32x8) PopCountMasked(mask Mask32x8) Uint32x8 +// Asm: VRSQRT14PS, CPU Feature: AVX512F +func (x Float32x8) ReciprocalSqrtMasked(mask Mask32x8) Float32x8 -// PopCountMasked counts the number of set bits in each element. +// ReciprocalSqrtMasked computes an approximate reciprocal of the square root of each element. // // This operation is applied selectively under a write mask. // -// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ -func (x Uint32x16) PopCountMasked(mask Mask32x16) Uint32x16 +// Asm: VRSQRT14PS, CPU Feature: AVX512F +func (x Float32x16) ReciprocalSqrtMasked(mask Mask32x16) Float32x16 -// PopCountMasked counts the number of set bits in each element. +// ReciprocalSqrtMasked computes an approximate reciprocal of the square root of each element. // // This operation is applied selectively under a write mask. // -// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ -func (x Uint64x2) PopCountMasked(mask Mask64x2) Uint64x2 +// Asm: VRSQRT14PD, CPU Feature: AVX512F +func (x Float64x2) ReciprocalSqrtMasked(mask Mask64x2) Float64x2 -// PopCountMasked counts the number of set bits in each element. +// ReciprocalSqrtMasked computes an approximate reciprocal of the square root of each element. // // This operation is applied selectively under a write mask. // -// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ -func (x Uint64x4) PopCountMasked(mask Mask64x4) Uint64x4 +// Asm: VRSQRT14PD, CPU Feature: AVX512F +func (x Float64x4) ReciprocalSqrtMasked(mask Mask64x4) Float64x4 -// PopCountMasked counts the number of set bits in each element. +// ReciprocalSqrtMasked computes an approximate reciprocal of the square root of each element. // // This operation is applied selectively under a write mask. // -// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ -func (x Uint64x8) PopCountMasked(mask Mask64x8) Uint64x8 +// Asm: VRSQRT14PD, CPU Feature: AVX512F +func (x Float64x8) ReciprocalSqrtMasked(mask Mask64x8) Float64x8 /* RotateAllLeft */ @@ -8647,353 +8780,227 @@ func (x Uint64x4) RotateRightMasked(y Uint64x4, mask Mask64x4) Uint64x4 // Asm: VPRORVQ, CPU Feature: AVX512F func (x Uint64x8) RotateRightMasked(y Uint64x8, mask Mask64x8) Uint64x8 -/* Round */ +/* RoundToEven */ -// Round rounds elements to the nearest integer. +// RoundToEven rounds elements to the nearest integer. // // Asm: VROUNDPS, CPU Feature: AVX -func (x Float32x4) Round() Float32x4 +func (x Float32x4) RoundToEven() Float32x4 -// Round rounds elements to the nearest integer. +// RoundToEven rounds elements to the nearest integer. // // Asm: VROUNDPS, CPU Feature: AVX -func (x Float32x8) Round() Float32x8 +func (x Float32x8) RoundToEven() Float32x8 -// Round rounds elements to the nearest integer. +// RoundToEven rounds elements to the nearest integer. // // Asm: VROUNDPD, CPU Feature: AVX -func (x Float64x2) Round() Float64x2 +func (x Float64x2) RoundToEven() Float64x2 -// Round rounds elements to the nearest integer. +// RoundToEven rounds elements to the nearest integer. // // Asm: VROUNDPD, CPU Feature: AVX -func (x Float64x4) Round() Float64x4 +func (x Float64x4) RoundToEven() Float64x4 -/* RoundScaled */ +/* RoundToEvenScaled */ -// RoundScaled rounds elements with specified precision. +// RoundToEvenScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. 
// // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) RoundScaled(prec uint8) Float32x4 +func (x Float32x4) RoundToEvenScaled(prec uint8) Float32x4 -// RoundScaled rounds elements with specified precision. +// RoundToEvenScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) RoundScaled(prec uint8) Float32x8 +func (x Float32x8) RoundToEvenScaled(prec uint8) Float32x8 -// RoundScaled rounds elements with specified precision. +// RoundToEvenScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) RoundScaled(prec uint8) Float32x16 +func (x Float32x16) RoundToEvenScaled(prec uint8) Float32x16 -// RoundScaled rounds elements with specified precision. +// RoundToEvenScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) RoundScaled(prec uint8) Float64x2 +func (x Float64x2) RoundToEvenScaled(prec uint8) Float64x2 -// RoundScaled rounds elements with specified precision. +// RoundToEvenScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) RoundScaled(prec uint8) Float64x4 +func (x Float64x4) RoundToEvenScaled(prec uint8) Float64x4 -// RoundScaled rounds elements with specified precision. +// RoundToEvenScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) RoundScaled(prec uint8) Float64x8 +func (x Float64x8) RoundToEvenScaled(prec uint8) Float64x8 -/* RoundScaledMasked */ +/* RoundToEvenScaledMasked */ -// RoundScaledMasked rounds elements with specified precision. +// RoundToEvenScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) RoundScaledMasked(prec uint8, mask Mask32x4) Float32x4 +func (x Float32x4) RoundToEvenScaledMasked(prec uint8, mask Mask32x4) Float32x4 -// RoundScaledMasked rounds elements with specified precision. +// RoundToEvenScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) RoundScaledMasked(prec uint8, mask Mask32x8) Float32x8 +func (x Float32x8) RoundToEvenScaledMasked(prec uint8, mask Mask32x8) Float32x8 -// RoundScaledMasked rounds elements with specified precision. +// RoundToEvenScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. 
// // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) RoundScaledMasked(prec uint8, mask Mask32x16) Float32x16 +func (x Float32x16) RoundToEvenScaledMasked(prec uint8, mask Mask32x16) Float32x16 -// RoundScaledMasked rounds elements with specified precision. +// RoundToEvenScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) RoundScaledMasked(prec uint8, mask Mask64x2) Float64x2 +func (x Float64x2) RoundToEvenScaledMasked(prec uint8, mask Mask64x2) Float64x2 -// RoundScaledMasked rounds elements with specified precision. +// RoundToEvenScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) RoundScaledMasked(prec uint8, mask Mask64x4) Float64x4 +func (x Float64x4) RoundToEvenScaledMasked(prec uint8, mask Mask64x4) Float64x4 -// RoundScaledMasked rounds elements with specified precision. +// RoundToEvenScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) RoundScaledMasked(prec uint8, mask Mask64x8) Float64x8 +func (x Float64x8) RoundToEvenScaledMasked(prec uint8, mask Mask64x8) Float64x8 -/* RoundScaledResidue */ +/* RoundToEvenScaledResidue */ -// RoundScaledResidue computes the difference after rounding with specified precision. +// RoundToEvenScaledResidue computes the difference after rounding with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) RoundScaledResidue(prec uint8) Float32x4 +func (x Float32x4) RoundToEvenScaledResidue(prec uint8) Float32x4 -// RoundScaledResidue computes the difference after rounding with specified precision. +// RoundToEvenScaledResidue computes the difference after rounding with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) RoundScaledResidue(prec uint8) Float32x8 +func (x Float32x8) RoundToEvenScaledResidue(prec uint8) Float32x8 -// RoundScaledResidue computes the difference after rounding with specified precision. +// RoundToEvenScaledResidue computes the difference after rounding with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) RoundScaledResidue(prec uint8) Float32x16 +func (x Float32x16) RoundToEvenScaledResidue(prec uint8) Float32x16 -// RoundScaledResidue computes the difference after rounding with specified precision. +// RoundToEvenScaledResidue computes the difference after rounding with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. 
// // Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) RoundScaledResidue(prec uint8) Float64x2 +func (x Float64x2) RoundToEvenScaledResidue(prec uint8) Float64x2 -// RoundScaledResidue computes the difference after rounding with specified precision. +// RoundToEvenScaledResidue computes the difference after rounding with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) RoundScaledResidue(prec uint8) Float64x4 +func (x Float64x4) RoundToEvenScaledResidue(prec uint8) Float64x4 -// RoundScaledResidue computes the difference after rounding with specified precision. +// RoundToEvenScaledResidue computes the difference after rounding with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) RoundScaledResidue(prec uint8) Float64x8 +func (x Float64x8) RoundToEvenScaledResidue(prec uint8) Float64x8 -/* RoundScaledResidueMasked */ +/* RoundToEvenScaledResidueMasked */ -// RoundScaledResidueMasked computes the difference after rounding with specified precision. +// RoundToEvenScaledResidueMasked computes the difference after rounding with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) RoundScaledResidueMasked(prec uint8, mask Mask32x4) Float32x4 +func (x Float32x4) RoundToEvenScaledResidueMasked(prec uint8, mask Mask32x4) Float32x4 -// RoundScaledResidueMasked computes the difference after rounding with specified precision. +// RoundToEvenScaledResidueMasked computes the difference after rounding with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) RoundScaledResidueMasked(prec uint8, mask Mask32x8) Float32x8 +func (x Float32x8) RoundToEvenScaledResidueMasked(prec uint8, mask Mask32x8) Float32x8 -// RoundScaledResidueMasked computes the difference after rounding with specified precision. +// RoundToEvenScaledResidueMasked computes the difference after rounding with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) RoundScaledResidueMasked(prec uint8, mask Mask32x16) Float32x16 +func (x Float32x16) RoundToEvenScaledResidueMasked(prec uint8, mask Mask32x16) Float32x16 -// RoundScaledResidueMasked computes the difference after rounding with specified precision. +// RoundToEvenScaledResidueMasked computes the difference after rounding with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) RoundScaledResidueMasked(prec uint8, mask Mask64x2) Float64x2 +func (x Float64x2) RoundToEvenScaledResidueMasked(prec uint8, mask Mask64x2) Float64x2 -// RoundScaledResidueMasked computes the difference after rounding with specified precision. 
+// RoundToEvenScaledResidueMasked computes the difference after rounding with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) RoundScaledResidueMasked(prec uint8, mask Mask64x4) Float64x4 +func (x Float64x4) RoundToEvenScaledResidueMasked(prec uint8, mask Mask64x4) Float64x4 -// RoundScaledResidueMasked computes the difference after rounding with specified precision. +// RoundToEvenScaledResidueMasked computes the difference after rounding with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) RoundScaledResidueMasked(prec uint8, mask Mask64x8) Float64x8 - -/* SaturatedAddDotProd */ - -// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x. -// -// Asm: VPDPWSSDS, CPU Feature: AVXVNNI -func (x Int32x4) SaturatedAddDotProd(y Int16x8, z Int16x8) Int32x4 - -// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x. -// -// Asm: VPDPWSSDS, CPU Feature: AVXVNNI -func (x Int32x8) SaturatedAddDotProd(y Int16x16, z Int16x16) Int32x8 - -// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x. -// -// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI -func (x Int32x16) SaturatedAddDotProd(y Int16x32, z Int16x32) Int32x16 - -/* SaturatedAddDotProdMasked */ - -// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI -func (x Int32x4) SaturatedAddDotProdMasked(y Int16x8, z Int16x8, mask Mask32x4) Int32x4 - -// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI -func (x Int32x8) SaturatedAddDotProdMasked(y Int16x16, z Int16x16, mask Mask32x8) Int32x8 - -// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI -func (x Int32x16) SaturatedAddDotProdMasked(y Int16x32, z Int16x32, mask Mask32x16) Int32x16 - -/* SaturatedUnsignedSignedPairDotProd */ - -// SaturatedUnsignedSignedPairDotProd multiplies the elements and add the pairs together with saturation, -// yielding a vector of half as many elements with twice the input element size. -// -// Asm: VPMADDUBSW, CPU Feature: AVX -func (x Uint8x16) SaturatedUnsignedSignedPairDotProd(y Int8x16) Int16x8 - -// SaturatedUnsignedSignedPairDotProd multiplies the elements and add the pairs together with saturation, -// yielding a vector of half as many elements with twice the input element size. -// -// Asm: VPMADDUBSW, CPU Feature: AVX2 -func (x Uint8x32) SaturatedUnsignedSignedPairDotProd(y Int8x32) Int16x16 - -// SaturatedUnsignedSignedPairDotProd multiplies the elements and add the pairs together with saturation, -// yielding a vector of half as many elements with twice the input element size. 
-// -// Asm: VPMADDUBSW, CPU Feature: AVX512BW -func (x Uint8x64) SaturatedUnsignedSignedPairDotProd(y Int8x64) Int16x32 - -/* SaturatedUnsignedSignedPairDotProdMasked */ - -// SaturatedUnsignedSignedPairDotProdMasked multiplies the elements and add the pairs together with saturation, -// yielding a vector of half as many elements with twice the input element size. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMADDUBSW, CPU Feature: AVX512BW -func (x Uint8x16) SaturatedUnsignedSignedPairDotProdMasked(y Int8x16, mask Mask16x8) Int16x8 - -// SaturatedUnsignedSignedPairDotProdMasked multiplies the elements and add the pairs together with saturation, -// yielding a vector of half as many elements with twice the input element size. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMADDUBSW, CPU Feature: AVX512BW -func (x Uint8x32) SaturatedUnsignedSignedPairDotProdMasked(y Int8x32, mask Mask16x16) Int16x16 - -// SaturatedUnsignedSignedPairDotProdMasked multiplies the elements and add the pairs together with saturation, -// yielding a vector of half as many elements with twice the input element size. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMADDUBSW, CPU Feature: AVX512BW -func (x Uint8x64) SaturatedUnsignedSignedPairDotProdMasked(y Int8x64, mask Mask16x32) Int16x32 - -/* SaturatedUnsignedSignedQuadDotProdAccumulate */ - -// SaturatedUnsignedSignedQuadDotProdAccumulate multiplies performs dot products on groups of 4 elements of x and y and then adds z. -// -// Asm: VPDPBUSDS, CPU Feature: AVXVNNI -func (x Int8x16) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int32x4) Int32x4 - -// SaturatedUnsignedSignedQuadDotProdAccumulate multiplies performs dot products on groups of 4 elements of x and y and then adds z. -// -// Asm: VPDPBUSDS, CPU Feature: AVXVNNI -func (x Int8x32) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int32x8) Int32x8 - -// SaturatedUnsignedSignedQuadDotProdAccumulate multiplies performs dot products on groups of 4 elements of x and y and then adds z. -// -// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI -func (x Int8x64) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16) Int32x16 - -/* SaturatedUnsignedSignedQuadDotProdAccumulateMasked */ - -// SaturatedUnsignedSignedQuadDotProdAccumulateMasked multiplies performs dot products on groups of 4 elements of x and y and then adds z. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI -func (x Int8x16) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x16, z Int32x4, mask Mask32x4) Int32x4 - -// SaturatedUnsignedSignedQuadDotProdAccumulateMasked multiplies performs dot products on groups of 4 elements of x and y and then adds z. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI -func (x Int8x32) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x32, z Int32x8, mask Mask32x8) Int32x8 - -// SaturatedUnsignedSignedQuadDotProdAccumulateMasked multiplies performs dot products on groups of 4 elements of x and y and then adds z. -// -// This operation is applied selectively under a write mask. 
-// -// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI -func (x Int8x64) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x64, z Int32x16, mask Mask32x16) Int32x16 +func (x Float64x8) RoundToEvenScaledResidueMasked(prec uint8, mask Mask64x8) Float64x8 /* Scale */ @@ -11381,44 +11388,6 @@ func (x Uint64x4) ShiftRightMasked(y Uint64x4, mask Mask64x4) Uint64x4 // Asm: VPSRLVQ, CPU Feature: AVX512F func (x Uint64x8) ShiftRightMasked(y Uint64x8, mask Mask64x8) Uint64x8 -/* Sign */ - -// Sign returns the product of the first operand with -1, 0, or 1, -// whichever constant is nearest to the value of the second operand. -// -// Asm: VPSIGNB, CPU Feature: AVX -func (x Int8x16) Sign(y Int8x16) Int8x16 - -// Sign returns the product of the first operand with -1, 0, or 1, -// whichever constant is nearest to the value of the second operand. -// -// Asm: VPSIGNB, CPU Feature: AVX2 -func (x Int8x32) Sign(y Int8x32) Int8x32 - -// Sign returns the product of the first operand with -1, 0, or 1, -// whichever constant is nearest to the value of the second operand. -// -// Asm: VPSIGNW, CPU Feature: AVX -func (x Int16x8) Sign(y Int16x8) Int16x8 - -// Sign returns the product of the first operand with -1, 0, or 1, -// whichever constant is nearest to the value of the second operand. -// -// Asm: VPSIGNW, CPU Feature: AVX2 -func (x Int16x16) Sign(y Int16x16) Int16x16 - -// Sign returns the product of the first operand with -1, 0, or 1, -// whichever constant is nearest to the value of the second operand. -// -// Asm: VPSIGND, CPU Feature: AVX -func (x Int32x4) Sign(y Int32x4) Int32x4 - -// Sign returns the product of the first operand with -1, 0, or 1, -// whichever constant is nearest to the value of the second operand. -// -// Asm: VPSIGND, CPU Feature: AVX2 -func (x Int32x8) Sign(y Int32x8) Int32x8 - /* Sqrt */ // Sqrt computes the square root of each element. @@ -11981,32 +11950,32 @@ func (x Int16x32) SubSaturated(y Int16x32) Int16x32 // SubSaturated subtracts corresponding elements of two vectors with saturation. // -// Asm: VPSUBSB, CPU Feature: AVX +// Asm: VPSUBUSB, CPU Feature: AVX func (x Uint8x16) SubSaturated(y Uint8x16) Uint8x16 // SubSaturated subtracts corresponding elements of two vectors with saturation. // -// Asm: VPSUBSB, CPU Feature: AVX2 +// Asm: VPSUBUSB, CPU Feature: AVX2 func (x Uint8x32) SubSaturated(y Uint8x32) Uint8x32 // SubSaturated subtracts corresponding elements of two vectors with saturation. // -// Asm: VPSUBSB, CPU Feature: AVX512BW +// Asm: VPSUBUSB, CPU Feature: AVX512BW func (x Uint8x64) SubSaturated(y Uint8x64) Uint8x64 // SubSaturated subtracts corresponding elements of two vectors with saturation. // -// Asm: VPSUBSW, CPU Feature: AVX +// Asm: VPSUBUSW, CPU Feature: AVX func (x Uint16x8) SubSaturated(y Uint16x8) Uint16x8 // SubSaturated subtracts corresponding elements of two vectors with saturation. // -// Asm: VPSUBSW, CPU Feature: AVX2 +// Asm: VPSUBUSW, CPU Feature: AVX2 func (x Uint16x16) SubSaturated(y Uint16x16) Uint16x16 // SubSaturated subtracts corresponding elements of two vectors with saturation. // -// Asm: VPSUBSW, CPU Feature: AVX512BW +// Asm: VPSUBUSW, CPU Feature: AVX512BW func (x Uint16x32) SubSaturated(y Uint16x32) Uint16x32 /* SubSaturatedMasked */ @@ -12057,42 +12026,42 @@ func (x Int16x32) SubSaturatedMasked(y Int16x32, mask Mask16x32) Int16x32 // // This operation is applied selectively under a write mask. 
// -// Asm: VPSUBSB, CPU Feature: AVX512BW +// Asm: VPSUBUSB, CPU Feature: AVX512BW func (x Uint8x16) SubSaturatedMasked(y Uint8x16, mask Mask8x16) Uint8x16 // SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. // // This operation is applied selectively under a write mask. // -// Asm: VPSUBSB, CPU Feature: AVX512BW +// Asm: VPSUBUSB, CPU Feature: AVX512BW func (x Uint8x32) SubSaturatedMasked(y Uint8x32, mask Mask8x32) Uint8x32 // SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. // // This operation is applied selectively under a write mask. // -// Asm: VPSUBSB, CPU Feature: AVX512BW +// Asm: VPSUBUSB, CPU Feature: AVX512BW func (x Uint8x64) SubSaturatedMasked(y Uint8x64, mask Mask8x64) Uint8x64 // SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. // // This operation is applied selectively under a write mask. // -// Asm: VPSUBSW, CPU Feature: AVX512BW +// Asm: VPSUBUSW, CPU Feature: AVX512BW func (x Uint16x8) SubSaturatedMasked(y Uint16x8, mask Mask16x8) Uint16x8 // SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. // // This operation is applied selectively under a write mask. // -// Asm: VPSUBSW, CPU Feature: AVX512BW +// Asm: VPSUBUSW, CPU Feature: AVX512BW func (x Uint16x16) SubSaturatedMasked(y Uint16x16, mask Mask16x16) Uint16x16 // SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. // // This operation is applied selectively under a write mask. // -// Asm: VPSUBSW, CPU Feature: AVX512BW +// Asm: VPSUBUSW, CPU Feature: AVX512BW func (x Uint16x32) SubSaturatedMasked(y Uint16x32, mask Mask16x32) Uint16x32 /* Trunc */ @@ -12317,46 +12286,6 @@ func (x Float64x4) TruncScaledResidueMasked(prec uint8, mask Mask64x4) Float64x4 // Asm: VREDUCEPD, CPU Feature: AVX512DQ func (x Float64x8) TruncScaledResidueMasked(prec uint8, mask Mask64x8) Float64x8 -/* UnsignedSignedQuadDotProdAccumulate */ - -// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of x and y and then adds z. -// -// Asm: VPDPBUSD, CPU Feature: AVXVNNI -func (x Int8x16) UnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int32x4) Int32x4 - -// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of x and y and then adds z. -// -// Asm: VPDPBUSD, CPU Feature: AVXVNNI -func (x Int8x32) UnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int32x8) Int32x8 - -// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of x and y and then adds z. -// -// Asm: VPDPBUSD, CPU Feature: AVX512VNNI -func (x Int8x64) UnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16) Int32x16 - -/* UnsignedSignedQuadDotProdAccumulateMasked */ - -// UnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of x and y and then adds z. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPDPBUSD, CPU Feature: AVX512VNNI -func (x Int8x16) UnsignedSignedQuadDotProdAccumulateMasked(y Uint8x16, z Int32x4, mask Mask32x4) Int32x4 - -// UnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of x and y and then adds z. -// -// This operation is applied selectively under a write mask. 
-// -// Asm: VPDPBUSD, CPU Feature: AVX512VNNI -func (x Int8x32) UnsignedSignedQuadDotProdAccumulateMasked(y Uint8x32, z Int32x8, mask Mask32x8) Int32x8 - -// UnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of x and y and then adds z. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPDPBUSD, CPU Feature: AVX512VNNI -func (x Int8x64) UnsignedSignedQuadDotProdAccumulateMasked(y Uint8x64, z Int32x16, mask Mask32x16) Int32x16 - /* Xor */ // Xor performs a bitwise XOR operation between two vectors. diff --git a/src/simd/simd_test.go b/src/simd/simd_test.go index 7776a8afda..4c3817599e 100644 --- a/src/simd/simd_test.go +++ b/src/simd/simd_test.go @@ -203,25 +203,6 @@ func TestExpand(t *testing.T) { } } -func TestPairDotProdAccumulate(t *testing.T) { - if !simd.HasAVX512GFNI() { - // TODO: this function is actually VNNI, let's implement and call the right check. - t.Skip("Test requires HasAVX512GFNI, not available on this hardware") - return - } - x := simd.LoadInt16x8Slice([]int16{2, 2, 2, 2, 2, 2, 2, 2}) - z := simd.LoadInt32x4Slice([]int32{3, 3, 3, 3}) - want := []int32{11, 11, 11, 11} - got := make([]int32, 4) - z = z.AddDotProd(x, x) - z.StoreSlice(got) - for i := range 4 { - if got[i] != want[i] { - t.Errorf("a and b differ at index %d, got=%d, want=%d", i, got[i], want[i]) - } - } -} - var testShiftAllVal uint64 = 3 func TestShiftAll(t *testing.T) { diff --git a/src/simd/ternary_test.go b/src/simd/ternary_test.go index 9ce0ff7676..2374635917 100644 --- a/src/simd/ternary_test.go +++ b/src/simd/ternary_test.go @@ -13,11 +13,11 @@ import ( func TestFMA(t *testing.T) { if simd.HasAVX512() { - testFloat32x4TernaryFlaky(t, simd.Float32x4.FusedMultiplyAdd, fmaSlice[float32], 0.001) - testFloat32x8TernaryFlaky(t, simd.Float32x8.FusedMultiplyAdd, fmaSlice[float32], 0.001) - testFloat32x16TernaryFlaky(t, simd.Float32x16.FusedMultiplyAdd, fmaSlice[float32], 0.001) - testFloat64x2Ternary(t, simd.Float64x2.FusedMultiplyAdd, fmaSlice[float64]) - testFloat64x4Ternary(t, simd.Float64x4.FusedMultiplyAdd, fmaSlice[float64]) - testFloat64x8Ternary(t, simd.Float64x8.FusedMultiplyAdd, fmaSlice[float64]) + testFloat32x4TernaryFlaky(t, simd.Float32x4.MulAdd, fmaSlice[float32], 0.001) + testFloat32x8TernaryFlaky(t, simd.Float32x8.MulAdd, fmaSlice[float32], 0.001) + testFloat32x16TernaryFlaky(t, simd.Float32x16.MulAdd, fmaSlice[float32], 0.001) + testFloat64x2Ternary(t, simd.Float64x2.MulAdd, fmaSlice[float64]) + testFloat64x4Ternary(t, simd.Float64x4.MulAdd, fmaSlice[float64]) + testFloat64x8Ternary(t, simd.Float64x8.MulAdd, fmaSlice[float64]) } } diff --git a/src/simd/unary_test.go b/src/simd/unary_test.go index c9fdfff0ff..5709ca73c7 100644 --- a/src/simd/unary_test.go +++ b/src/simd/unary_test.go @@ -46,10 +46,10 @@ func TestTrunc(t *testing.T) { } func TestRound(t *testing.T) { - testFloat32x4Unary(t, simd.Float32x4.Round, roundSlice[float32]) - testFloat32x8Unary(t, simd.Float32x8.Round, roundSlice[float32]) - testFloat64x2Unary(t, simd.Float64x2.Round, roundSlice[float64]) - testFloat64x4Unary(t, simd.Float64x4.Round, roundSlice[float64]) + testFloat32x4Unary(t, simd.Float32x4.RoundToEven, roundSlice[float32]) + testFloat32x8Unary(t, simd.Float32x8.RoundToEven, roundSlice[float32]) + testFloat64x2Unary(t, simd.Float64x2.RoundToEven, roundSlice[float64]) + testFloat64x4Unary(t, simd.Float64x4.RoundToEven, roundSlice[float64]) if simd.HasAVX512() { // testFloat32x16Unary(t, simd.Float32x16.Round, roundSlice[float32]) // missing // 
testFloat64x8Unary(t, simd.Float64x8.Round, roundSlice[float64]) // missing @@ -68,19 +68,19 @@ func TestSqrt(t *testing.T) { } func TestAbsolute(t *testing.T) { - testInt8x16Unary(t, simd.Int8x16.Absolute, map1[int8](abs)) - testInt8x32Unary(t, simd.Int8x32.Absolute, map1[int8](abs)) - testInt16x8Unary(t, simd.Int16x8.Absolute, map1[int16](abs)) - testInt16x16Unary(t, simd.Int16x16.Absolute, map1[int16](abs)) - testInt32x4Unary(t, simd.Int32x4.Absolute, map1[int32](abs)) - testInt32x8Unary(t, simd.Int32x8.Absolute, map1[int32](abs)) + testInt8x16Unary(t, simd.Int8x16.Abs, map1[int8](abs)) + testInt8x32Unary(t, simd.Int8x32.Abs, map1[int8](abs)) + testInt16x8Unary(t, simd.Int16x8.Abs, map1[int16](abs)) + testInt16x16Unary(t, simd.Int16x16.Abs, map1[int16](abs)) + testInt32x4Unary(t, simd.Int32x4.Abs, map1[int32](abs)) + testInt32x8Unary(t, simd.Int32x8.Abs, map1[int32](abs)) if simd.HasAVX512() { - testInt8x64Unary(t, simd.Int8x64.Absolute, map1[int8](abs)) - testInt16x32Unary(t, simd.Int16x32.Absolute, map1[int16](abs)) - testInt32x16Unary(t, simd.Int32x16.Absolute, map1[int32](abs)) - testInt64x2Unary(t, simd.Int64x2.Absolute, map1[int64](abs)) - testInt64x4Unary(t, simd.Int64x4.Absolute, map1[int64](abs)) - testInt64x8Unary(t, simd.Int64x8.Absolute, map1[int64](abs)) + testInt8x64Unary(t, simd.Int8x64.Abs, map1[int8](abs)) + testInt16x32Unary(t, simd.Int16x32.Abs, map1[int16](abs)) + testInt32x16Unary(t, simd.Int32x16.Abs, map1[int32](abs)) + testInt64x2Unary(t, simd.Int64x2.Abs, map1[int64](abs)) + testInt64x4Unary(t, simd.Int64x4.Abs, map1[int64](abs)) + testInt64x8Unary(t, simd.Int64x8.Abs, map1[int64](abs)) } } -- 2.52.0
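
Below is a minimal, illustrative sketch of how the renamed API surface reads after this change, written in the style of the tests edited above. It assumes the experimental "simd" package from the dev.simd branch and hardware with the relevant AVX features, so treat it as a sketch rather than a supported example. Abs, RoundToEven, SubSaturated, LoadInt32x4Slice, and StoreSlice all appear in the hunks above; LoadFloat64x2Slice and LoadUint8x16Slice are assumed loader names, chosen by analogy with the loaders used in simd_test.go, and are not confirmed by this patch.

package simd_test

import (
	"simd"
	"testing"
)

// TestRenamedAPISketch is illustrative only; names marked "assumed" in the
// comments below are not taken from this patch.
func TestRenamedAPISketch(t *testing.T) {
	// Absolute -> Abs: elementwise absolute value, unchanged semantics.
	x := simd.LoadInt32x4Slice([]int32{-3, 0, 7, -12})
	gotAbs := make([]int32, 4)
	x.Abs().StoreSlice(gotAbs)
	wantAbs := []int32{3, 0, 7, 12}
	for i := range wantAbs {
		if gotAbs[i] != wantAbs[i] {
			t.Errorf("Abs: index %d, got=%d, want=%d", i, gotAbs[i], wantAbs[i])
		}
	}

	// Round -> RoundToEven: ties round to the nearest even integer.
	// LoadFloat64x2Slice is an assumed loader name.
	f := simd.LoadFloat64x2Slice([]float64{2.5, -1.5})
	gotRound := make([]float64, 2)
	f.RoundToEven().StoreSlice(gotRound)
	if gotRound[0] != 2 || gotRound[1] != -2 {
		t.Errorf("RoundToEven: got=%v, want=[2 -2]", gotRound)
	}

	// SubSaturated on unsigned vectors now maps to VPSUBUSB/VPSUBUSW,
	// so underflow clamps to 0 instead of using signed saturation.
	// LoadUint8x16Slice is an assumed loader name.
	av, bv := make([]uint8, 16), make([]uint8, 16)
	av[0], bv[0] = 200, 10 // 200-10 = 190, no clamping
	av[1], bv[1] = 10, 200 // would underflow, clamps to 0
	gotSub := make([]uint8, 16)
	simd.LoadUint8x16Slice(av).SubSaturated(simd.LoadUint8x16Slice(bv)).StoreSlice(gotSub)
	if gotSub[0] != 190 || gotSub[1] != 0 {
		t.Errorf("SubSaturated: got=%v, want=[190 0]", gotSub[:2])
	}
}

The same pattern carries over to the other renames visible in the test diffs above (for example FusedMultiplyAdd -> MulAdd in ternary_test.go): only the method names change, not the call shapes.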