ssa.OpAMD64VPOR256,
ssa.OpAMD64VPORD512,
ssa.OpAMD64VPORQ512,
- ssa.OpAMD64VPSHUFB128,
+ ssa.OpAMD64VPERMB128,
ssa.OpAMD64VPERMB256,
ssa.OpAMD64VPERMB512,
ssa.OpAMD64VPERMW128,
ssa.OpAMD64VPERMQ256,
ssa.OpAMD64VPERMPD512,
ssa.OpAMD64VPERMQ512,
+ ssa.OpAMD64VPSHUFB128,
ssa.OpAMD64VPSHUFB256,
ssa.OpAMD64VPSHUFB512,
ssa.OpAMD64VPROLVD128,
ssa.OpAMD64VPORQMasked128,
ssa.OpAMD64VPORQMasked256,
ssa.OpAMD64VPORQMasked512,
- ssa.OpAMD64VPSHUFBMasked256,
- ssa.OpAMD64VPSHUFBMasked512,
- ssa.OpAMD64VPSHUFBMasked128,
+ ssa.OpAMD64VPERMBMasked128,
ssa.OpAMD64VPERMBMasked256,
ssa.OpAMD64VPERMBMasked512,
ssa.OpAMD64VPERMWMasked128,
ssa.OpAMD64VPERMQMasked256,
ssa.OpAMD64VPERMPDMasked512,
ssa.OpAMD64VPERMQMasked512,
+ ssa.OpAMD64VPSHUFBMasked256,
+ ssa.OpAMD64VPSHUFBMasked512,
+ ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VPROLVDMasked128,
ssa.OpAMD64VPROLVDMasked256,
ssa.OpAMD64VPROLVDMasked512,
ssa.OpAMD64VEXTRACTF64X4256,
ssa.OpAMD64VEXTRACTI128128,
ssa.OpAMD64VEXTRACTI64X4256,
- ssa.OpAMD64VPSHUFD128,
- ssa.OpAMD64VPSHUFD256,
- ssa.OpAMD64VPSHUFD512,
- ssa.OpAMD64VPSHUFHW128,
- ssa.OpAMD64VPSHUFHW256,
- ssa.OpAMD64VPSHUFHW512,
ssa.OpAMD64VPROLD128,
ssa.OpAMD64VPROLD256,
ssa.OpAMD64VPROLD512,
ssa.OpAMD64VPRORQ128,
ssa.OpAMD64VPRORQ256,
ssa.OpAMD64VPRORQ512,
+ ssa.OpAMD64VPSHUFD128,
+ ssa.OpAMD64VPSHUFD256,
+ ssa.OpAMD64VPSHUFD512,
+ ssa.OpAMD64VPSHUFHW128,
+ ssa.OpAMD64VPSHUFHW256,
+ ssa.OpAMD64VPSHUFHW512,
+ ssa.OpAMD64VPSHUFLW128,
+ ssa.OpAMD64VPSHUFLW256,
+ ssa.OpAMD64VPSHUFLW512,
ssa.OpAMD64VPSLLW128const,
ssa.OpAMD64VPSLLW256const,
ssa.OpAMD64VPSLLW512const,
ssa.OpAMD64VREDUCEPDMasked128,
ssa.OpAMD64VREDUCEPDMasked256,
ssa.OpAMD64VREDUCEPDMasked512,
- ssa.OpAMD64VPSHUFDMasked256,
- ssa.OpAMD64VPSHUFDMasked512,
- ssa.OpAMD64VPSHUFHWMasked256,
- ssa.OpAMD64VPSHUFHWMasked512,
- ssa.OpAMD64VPSHUFHWMasked128,
- ssa.OpAMD64VPSHUFDMasked128,
ssa.OpAMD64VPROLDMasked128,
ssa.OpAMD64VPROLDMasked256,
ssa.OpAMD64VPROLDMasked512,
ssa.OpAMD64VPRORQMasked128,
ssa.OpAMD64VPRORQMasked256,
ssa.OpAMD64VPRORQMasked512,
+ ssa.OpAMD64VPSHUFDMasked256,
+ ssa.OpAMD64VPSHUFDMasked512,
+ ssa.OpAMD64VPSHUFHWMasked256,
+ ssa.OpAMD64VPSHUFHWMasked512,
+ ssa.OpAMD64VPSHUFHWMasked128,
+ ssa.OpAMD64VPSHUFLWMasked256,
+ ssa.OpAMD64VPSHUFLWMasked512,
+ ssa.OpAMD64VPSHUFLWMasked128,
+ ssa.OpAMD64VPSHUFDMasked128,
ssa.OpAMD64VPSLLWMasked128const,
ssa.OpAMD64VPSLLWMasked256const,
ssa.OpAMD64VPSLLWMasked512const,
case ssa.OpAMD64VPDPWSSD128,
ssa.OpAMD64VPDPWSSD256,
ssa.OpAMD64VPDPWSSD512,
+ ssa.OpAMD64VPERMI2B128,
+ ssa.OpAMD64VPERMI2B256,
+ ssa.OpAMD64VPERMI2B512,
+ ssa.OpAMD64VPERMI2W128,
+ ssa.OpAMD64VPERMI2W256,
+ ssa.OpAMD64VPERMI2W512,
+ ssa.OpAMD64VPERMI2PS128,
+ ssa.OpAMD64VPERMI2D128,
+ ssa.OpAMD64VPERMI2PS256,
+ ssa.OpAMD64VPERMI2D256,
+ ssa.OpAMD64VPERMI2PS512,
+ ssa.OpAMD64VPERMI2D512,
+ ssa.OpAMD64VPERMI2PD128,
+ ssa.OpAMD64VPERMI2Q128,
+ ssa.OpAMD64VPERMI2PD256,
+ ssa.OpAMD64VPERMI2Q256,
+ ssa.OpAMD64VPERMI2PD512,
+ ssa.OpAMD64VPERMI2Q512,
ssa.OpAMD64VPDPBUSD128,
ssa.OpAMD64VPDPBUSD256,
ssa.OpAMD64VPDPBUSD512,
ssa.OpAMD64VFMSUBADD213PD128,
ssa.OpAMD64VFMSUBADD213PD256,
ssa.OpAMD64VFMSUBADD213PD512,
- ssa.OpAMD64VPERMI2B128,
- ssa.OpAMD64VPERMI2B256,
- ssa.OpAMD64VPERMI2B512,
- ssa.OpAMD64VPERMI2W128,
- ssa.OpAMD64VPERMI2W256,
- ssa.OpAMD64VPERMI2W512,
- ssa.OpAMD64VPERMI2PS128,
- ssa.OpAMD64VPERMI2D128,
- ssa.OpAMD64VPERMI2PS256,
- ssa.OpAMD64VPERMI2D256,
- ssa.OpAMD64VPERMI2PS512,
- ssa.OpAMD64VPERMI2D512,
- ssa.OpAMD64VPERMI2PD128,
- ssa.OpAMD64VPERMI2Q128,
- ssa.OpAMD64VPERMI2PD256,
- ssa.OpAMD64VPERMI2Q256,
- ssa.OpAMD64VPERMI2PD512,
- ssa.OpAMD64VPERMI2Q512,
ssa.OpAMD64VPSHLDVW128,
ssa.OpAMD64VPSHLDVW256,
ssa.OpAMD64VPSHLDVW512,
ssa.OpAMD64VPAVGWMasked128Merging,
ssa.OpAMD64VPAVGWMasked256Merging,
ssa.OpAMD64VPAVGWMasked512Merging,
+ ssa.OpAMD64VPERMI2BMasked128,
+ ssa.OpAMD64VPERMI2BMasked256,
+ ssa.OpAMD64VPERMI2BMasked512,
+ ssa.OpAMD64VPERMI2WMasked128,
+ ssa.OpAMD64VPERMI2WMasked256,
+ ssa.OpAMD64VPERMI2WMasked512,
+ ssa.OpAMD64VPERMI2PSMasked128,
+ ssa.OpAMD64VPERMI2DMasked128,
+ ssa.OpAMD64VPERMI2PSMasked256,
+ ssa.OpAMD64VPERMI2DMasked256,
+ ssa.OpAMD64VPERMI2PSMasked512,
+ ssa.OpAMD64VPERMI2DMasked512,
+ ssa.OpAMD64VPERMI2PDMasked128,
+ ssa.OpAMD64VPERMI2QMasked128,
+ ssa.OpAMD64VPERMI2PDMasked256,
+ ssa.OpAMD64VPERMI2QMasked256,
+ ssa.OpAMD64VPERMI2PDMasked512,
+ ssa.OpAMD64VPERMI2QMasked512,
ssa.OpAMD64VPALIGNRMasked256Merging,
ssa.OpAMD64VPALIGNRMasked512Merging,
ssa.OpAMD64VPALIGNRMasked128Merging,
ssa.OpAMD64VPORQMasked128Merging,
ssa.OpAMD64VPORQMasked256Merging,
ssa.OpAMD64VPORQMasked512Merging,
- ssa.OpAMD64VPERMI2BMasked128,
- ssa.OpAMD64VPERMI2BMasked256,
- ssa.OpAMD64VPERMI2BMasked512,
- ssa.OpAMD64VPERMI2WMasked128,
- ssa.OpAMD64VPERMI2WMasked256,
- ssa.OpAMD64VPERMI2WMasked512,
- ssa.OpAMD64VPERMI2PSMasked128,
- ssa.OpAMD64VPERMI2DMasked128,
- ssa.OpAMD64VPERMI2PSMasked256,
- ssa.OpAMD64VPERMI2DMasked256,
- ssa.OpAMD64VPERMI2PSMasked512,
- ssa.OpAMD64VPERMI2DMasked512,
- ssa.OpAMD64VPERMI2PDMasked128,
- ssa.OpAMD64VPERMI2QMasked128,
- ssa.OpAMD64VPERMI2PDMasked256,
- ssa.OpAMD64VPERMI2QMasked256,
- ssa.OpAMD64VPERMI2PDMasked512,
- ssa.OpAMD64VPERMI2QMasked512,
ssa.OpAMD64VPSHUFBMasked256Merging,
ssa.OpAMD64VPSHUFBMasked512Merging,
ssa.OpAMD64VPSHUFBMasked128Merging,
p = simdV21load(s, v)
case ssa.OpAMD64VPDPWSSD512load,
+ ssa.OpAMD64VPERMI2PS128load,
+ ssa.OpAMD64VPERMI2D128load,
+ ssa.OpAMD64VPERMI2PS256load,
+ ssa.OpAMD64VPERMI2D256load,
+ ssa.OpAMD64VPERMI2PS512load,
+ ssa.OpAMD64VPERMI2D512load,
+ ssa.OpAMD64VPERMI2PD128load,
+ ssa.OpAMD64VPERMI2Q128load,
+ ssa.OpAMD64VPERMI2PD256load,
+ ssa.OpAMD64VPERMI2Q256load,
+ ssa.OpAMD64VPERMI2PD512load,
+ ssa.OpAMD64VPERMI2Q512load,
ssa.OpAMD64VPDPBUSD512load,
ssa.OpAMD64VPDPBUSDS512load,
ssa.OpAMD64VFMADD213PS128load,
ssa.OpAMD64VFMSUBADD213PD128load,
ssa.OpAMD64VFMSUBADD213PD256load,
ssa.OpAMD64VFMSUBADD213PD512load,
- ssa.OpAMD64VPERMI2PS128load,
- ssa.OpAMD64VPERMI2D128load,
- ssa.OpAMD64VPERMI2PS256load,
- ssa.OpAMD64VPERMI2D256load,
- ssa.OpAMD64VPERMI2PS512load,
- ssa.OpAMD64VPERMI2D512load,
- ssa.OpAMD64VPERMI2PD128load,
- ssa.OpAMD64VPERMI2Q128load,
- ssa.OpAMD64VPERMI2PD256load,
- ssa.OpAMD64VPERMI2Q256load,
- ssa.OpAMD64VPERMI2PD512load,
- ssa.OpAMD64VPERMI2Q512load,
ssa.OpAMD64VPSHLDVD128load,
ssa.OpAMD64VPSHLDVD256load,
ssa.OpAMD64VPSHLDVD512load,
case ssa.OpAMD64VPDPWSSDMasked128load,
ssa.OpAMD64VPDPWSSDMasked256load,
ssa.OpAMD64VPDPWSSDMasked512load,
+ ssa.OpAMD64VPERMI2PSMasked128load,
+ ssa.OpAMD64VPERMI2DMasked128load,
+ ssa.OpAMD64VPERMI2PSMasked256load,
+ ssa.OpAMD64VPERMI2DMasked256load,
+ ssa.OpAMD64VPERMI2PSMasked512load,
+ ssa.OpAMD64VPERMI2DMasked512load,
+ ssa.OpAMD64VPERMI2PDMasked128load,
+ ssa.OpAMD64VPERMI2QMasked128load,
+ ssa.OpAMD64VPERMI2PDMasked256load,
+ ssa.OpAMD64VPERMI2QMasked256load,
+ ssa.OpAMD64VPERMI2PDMasked512load,
+ ssa.OpAMD64VPERMI2QMasked512load,
ssa.OpAMD64VPDPBUSDMasked128load,
ssa.OpAMD64VPDPBUSDMasked256load,
ssa.OpAMD64VPDPBUSDMasked512load,
ssa.OpAMD64VFMSUBADD213PDMasked128load,
ssa.OpAMD64VFMSUBADD213PDMasked256load,
ssa.OpAMD64VFMSUBADD213PDMasked512load,
- ssa.OpAMD64VPERMI2PSMasked128load,
- ssa.OpAMD64VPERMI2DMasked128load,
- ssa.OpAMD64VPERMI2PSMasked256load,
- ssa.OpAMD64VPERMI2DMasked256load,
- ssa.OpAMD64VPERMI2PSMasked512load,
- ssa.OpAMD64VPERMI2DMasked512load,
- ssa.OpAMD64VPERMI2PDMasked128load,
- ssa.OpAMD64VPERMI2QMasked128load,
- ssa.OpAMD64VPERMI2PDMasked256load,
- ssa.OpAMD64VPERMI2QMasked256load,
- ssa.OpAMD64VPERMI2PDMasked512load,
- ssa.OpAMD64VPERMI2QMasked512load,
ssa.OpAMD64VPSHLDVDMasked128load,
ssa.OpAMD64VPSHLDVDMasked256load,
ssa.OpAMD64VPSHLDVDMasked512load,
ssa.OpAMD64VREDUCEPD128load,
ssa.OpAMD64VREDUCEPD256load,
ssa.OpAMD64VREDUCEPD512load,
- ssa.OpAMD64VPSHUFD512load,
ssa.OpAMD64VPROLD128load,
ssa.OpAMD64VPROLD256load,
ssa.OpAMD64VPROLD512load,
ssa.OpAMD64VPRORQ128load,
ssa.OpAMD64VPRORQ256load,
ssa.OpAMD64VPRORQ512load,
+ ssa.OpAMD64VPSHUFD512load,
ssa.OpAMD64VPSLLD512constload,
ssa.OpAMD64VPSLLQ512constload,
ssa.OpAMD64VPSRLD512constload,
ssa.OpAMD64VREDUCEPDMasked128load,
ssa.OpAMD64VREDUCEPDMasked256load,
ssa.OpAMD64VREDUCEPDMasked512load,
- ssa.OpAMD64VPSHUFDMasked256load,
- ssa.OpAMD64VPSHUFDMasked512load,
- ssa.OpAMD64VPSHUFDMasked128load,
ssa.OpAMD64VPROLDMasked128load,
ssa.OpAMD64VPROLDMasked256load,
ssa.OpAMD64VPROLDMasked512load,
ssa.OpAMD64VPRORQMasked128load,
ssa.OpAMD64VPRORQMasked256load,
ssa.OpAMD64VPRORQMasked512load,
+ ssa.OpAMD64VPSHUFDMasked256load,
+ ssa.OpAMD64VPSHUFDMasked512load,
+ ssa.OpAMD64VPSHUFDMasked128load,
ssa.OpAMD64VPSLLDMasked128constload,
ssa.OpAMD64VPSLLDMasked256constload,
ssa.OpAMD64VPSLLDMasked512constload,
ssa.OpAMD64VPOPCNTQMasked128Merging,
ssa.OpAMD64VPOPCNTQMasked256Merging,
ssa.OpAMD64VPOPCNTQMasked512Merging,
- ssa.OpAMD64VPSHUFDMasked256Merging,
- ssa.OpAMD64VPSHUFDMasked512Merging,
- ssa.OpAMD64VPSHUFHWMasked256Merging,
- ssa.OpAMD64VPSHUFHWMasked512Merging,
- ssa.OpAMD64VPSHUFHWMasked128Merging,
- ssa.OpAMD64VPSHUFDMasked128Merging,
ssa.OpAMD64VRCP14PSMasked128Merging,
ssa.OpAMD64VRCP14PSMasked256Merging,
ssa.OpAMD64VRCP14PSMasked512Merging,
ssa.OpAMD64VSQRTPDMasked128Merging,
ssa.OpAMD64VSQRTPDMasked256Merging,
ssa.OpAMD64VSQRTPDMasked512Merging,
+ ssa.OpAMD64VPSHUFDMasked256Merging,
+ ssa.OpAMD64VPSHUFDMasked512Merging,
+ ssa.OpAMD64VPSHUFHWMasked256Merging,
+ ssa.OpAMD64VPSHUFHWMasked512Merging,
+ ssa.OpAMD64VPSHUFHWMasked128Merging,
+ ssa.OpAMD64VPSHUFLWMasked256Merging,
+ ssa.OpAMD64VPSHUFLWMasked512Merging,
+ ssa.OpAMD64VPSHUFLWMasked128Merging,
+ ssa.OpAMD64VPSHUFDMasked128Merging,
ssa.OpAMD64VPSLLWMasked128constMerging,
ssa.OpAMD64VPSLLWMasked256constMerging,
ssa.OpAMD64VPSLLWMasked512constMerging,
ssa.OpAMD64VPCOMPRESSQMasked128,
ssa.OpAMD64VPCOMPRESSQMasked256,
ssa.OpAMD64VPCOMPRESSQMasked512,
+ ssa.OpAMD64VPERMI2BMasked128,
+ ssa.OpAMD64VPERMI2BMasked256,
+ ssa.OpAMD64VPERMI2BMasked512,
+ ssa.OpAMD64VPERMI2WMasked128,
+ ssa.OpAMD64VPERMI2WMasked256,
+ ssa.OpAMD64VPERMI2WMasked512,
+ ssa.OpAMD64VPERMI2PSMasked128,
+ ssa.OpAMD64VPERMI2PSMasked128load,
+ ssa.OpAMD64VPERMI2DMasked128,
+ ssa.OpAMD64VPERMI2DMasked128load,
+ ssa.OpAMD64VPERMI2PSMasked256,
+ ssa.OpAMD64VPERMI2PSMasked256load,
+ ssa.OpAMD64VPERMI2DMasked256,
+ ssa.OpAMD64VPERMI2DMasked256load,
+ ssa.OpAMD64VPERMI2PSMasked512,
+ ssa.OpAMD64VPERMI2PSMasked512load,
+ ssa.OpAMD64VPERMI2DMasked512,
+ ssa.OpAMD64VPERMI2DMasked512load,
+ ssa.OpAMD64VPERMI2PDMasked128,
+ ssa.OpAMD64VPERMI2PDMasked128load,
+ ssa.OpAMD64VPERMI2QMasked128,
+ ssa.OpAMD64VPERMI2QMasked128load,
+ ssa.OpAMD64VPERMI2PDMasked256,
+ ssa.OpAMD64VPERMI2PDMasked256load,
+ ssa.OpAMD64VPERMI2QMasked256,
+ ssa.OpAMD64VPERMI2QMasked256load,
+ ssa.OpAMD64VPERMI2PDMasked512,
+ ssa.OpAMD64VPERMI2PDMasked512load,
+ ssa.OpAMD64VPERMI2QMasked512,
+ ssa.OpAMD64VPERMI2QMasked512load,
ssa.OpAMD64VPALIGNRMasked256,
ssa.OpAMD64VPALIGNRMasked512,
ssa.OpAMD64VPALIGNRMasked128,
ssa.OpAMD64VPORQMasked256load,
ssa.OpAMD64VPORQMasked512,
ssa.OpAMD64VPORQMasked512load,
- ssa.OpAMD64VPERMI2BMasked128,
- ssa.OpAMD64VPERMI2BMasked256,
- ssa.OpAMD64VPERMI2BMasked512,
- ssa.OpAMD64VPERMI2WMasked128,
- ssa.OpAMD64VPERMI2WMasked256,
- ssa.OpAMD64VPERMI2WMasked512,
- ssa.OpAMD64VPERMI2PSMasked128,
- ssa.OpAMD64VPERMI2PSMasked128load,
- ssa.OpAMD64VPERMI2DMasked128,
- ssa.OpAMD64VPERMI2DMasked128load,
- ssa.OpAMD64VPERMI2PSMasked256,
- ssa.OpAMD64VPERMI2PSMasked256load,
- ssa.OpAMD64VPERMI2DMasked256,
- ssa.OpAMD64VPERMI2DMasked256load,
- ssa.OpAMD64VPERMI2PSMasked512,
- ssa.OpAMD64VPERMI2PSMasked512load,
- ssa.OpAMD64VPERMI2DMasked512,
- ssa.OpAMD64VPERMI2DMasked512load,
- ssa.OpAMD64VPERMI2PDMasked128,
- ssa.OpAMD64VPERMI2PDMasked128load,
- ssa.OpAMD64VPERMI2QMasked128,
- ssa.OpAMD64VPERMI2QMasked128load,
- ssa.OpAMD64VPERMI2PDMasked256,
- ssa.OpAMD64VPERMI2PDMasked256load,
- ssa.OpAMD64VPERMI2QMasked256,
- ssa.OpAMD64VPERMI2QMasked256load,
- ssa.OpAMD64VPERMI2PDMasked512,
- ssa.OpAMD64VPERMI2PDMasked512load,
- ssa.OpAMD64VPERMI2QMasked512,
- ssa.OpAMD64VPERMI2QMasked512load,
- ssa.OpAMD64VPSHUFDMasked256,
- ssa.OpAMD64VPSHUFDMasked256load,
- ssa.OpAMD64VPSHUFDMasked512,
- ssa.OpAMD64VPSHUFDMasked512load,
- ssa.OpAMD64VPSHUFHWMasked256,
- ssa.OpAMD64VPSHUFHWMasked512,
- ssa.OpAMD64VPSHUFHWMasked128,
- ssa.OpAMD64VPSHUFDMasked128,
- ssa.OpAMD64VPSHUFDMasked128load,
- ssa.OpAMD64VPSHUFBMasked256,
- ssa.OpAMD64VPSHUFBMasked512,
- ssa.OpAMD64VPSHUFBMasked128,
+ ssa.OpAMD64VPERMBMasked128,
ssa.OpAMD64VPERMBMasked256,
ssa.OpAMD64VPERMBMasked512,
ssa.OpAMD64VPERMWMasked128,
ssa.OpAMD64VPERMPDMasked512load,
ssa.OpAMD64VPERMQMasked512,
ssa.OpAMD64VPERMQMasked512load,
+ ssa.OpAMD64VPSHUFBMasked256,
+ ssa.OpAMD64VPSHUFBMasked512,
+ ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VRCP14PSMasked128,
ssa.OpAMD64VRCP14PSMasked128load,
ssa.OpAMD64VRCP14PSMasked256,
ssa.OpAMD64VMOVDQU64Masked128,
ssa.OpAMD64VMOVDQU64Masked256,
ssa.OpAMD64VMOVDQU64Masked512,
+ ssa.OpAMD64VPSHUFDMasked256,
+ ssa.OpAMD64VPSHUFDMasked256load,
+ ssa.OpAMD64VPSHUFDMasked512,
+ ssa.OpAMD64VPSHUFDMasked512load,
+ ssa.OpAMD64VPSHUFHWMasked256,
+ ssa.OpAMD64VPSHUFHWMasked512,
+ ssa.OpAMD64VPSHUFHWMasked128,
+ ssa.OpAMD64VPSHUFLWMasked256,
+ ssa.OpAMD64VPSHUFLWMasked512,
+ ssa.OpAMD64VPSHUFLWMasked128,
+ ssa.OpAMD64VPSHUFDMasked128,
+ ssa.OpAMD64VPSHUFDMasked128load,
ssa.OpAMD64VPSLLWMasked128const,
ssa.OpAMD64VPSLLWMasked256const,
ssa.OpAMD64VPSLLWMasked512const,
(CompressUint64x2 x mask) => (VPCOMPRESSQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
(CompressUint64x4 x mask) => (VPCOMPRESSQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
(CompressUint64x8 x mask) => (VPCOMPRESSQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(ConcatPermuteFloat32x4 ...) => (VPERMI2PS128 ...)
+(ConcatPermuteFloat32x8 ...) => (VPERMI2PS256 ...)
+(ConcatPermuteFloat32x16 ...) => (VPERMI2PS512 ...)
+(ConcatPermuteFloat64x2 ...) => (VPERMI2PD128 ...)
+(ConcatPermuteFloat64x4 ...) => (VPERMI2PD256 ...)
+(ConcatPermuteFloat64x8 ...) => (VPERMI2PD512 ...)
+(ConcatPermuteInt8x16 ...) => (VPERMI2B128 ...)
+(ConcatPermuteInt8x32 ...) => (VPERMI2B256 ...)
+(ConcatPermuteInt8x64 ...) => (VPERMI2B512 ...)
+(ConcatPermuteInt16x8 ...) => (VPERMI2W128 ...)
+(ConcatPermuteInt16x16 ...) => (VPERMI2W256 ...)
+(ConcatPermuteInt16x32 ...) => (VPERMI2W512 ...)
+(ConcatPermuteInt32x4 ...) => (VPERMI2D128 ...)
+(ConcatPermuteInt32x8 ...) => (VPERMI2D256 ...)
+(ConcatPermuteInt32x16 ...) => (VPERMI2D512 ...)
+(ConcatPermuteInt64x2 ...) => (VPERMI2Q128 ...)
+(ConcatPermuteInt64x4 ...) => (VPERMI2Q256 ...)
+(ConcatPermuteInt64x8 ...) => (VPERMI2Q512 ...)
+(ConcatPermuteUint8x16 ...) => (VPERMI2B128 ...)
+(ConcatPermuteUint8x32 ...) => (VPERMI2B256 ...)
+(ConcatPermuteUint8x64 ...) => (VPERMI2B512 ...)
+(ConcatPermuteUint16x8 ...) => (VPERMI2W128 ...)
+(ConcatPermuteUint16x16 ...) => (VPERMI2W256 ...)
+(ConcatPermuteUint16x32 ...) => (VPERMI2W512 ...)
+(ConcatPermuteUint32x4 ...) => (VPERMI2D128 ...)
+(ConcatPermuteUint32x8 ...) => (VPERMI2D256 ...)
+(ConcatPermuteUint32x16 ...) => (VPERMI2D512 ...)
+(ConcatPermuteUint64x2 ...) => (VPERMI2Q128 ...)
+(ConcatPermuteUint64x4 ...) => (VPERMI2Q256 ...)
+(ConcatPermuteUint64x8 ...) => (VPERMI2Q512 ...)
(ConcatShiftBytesRightUint8x16 ...) => (VPALIGNR128 ...)
(ConcatShiftBytesRightGroupedUint8x32 ...) => (VPALIGNR256 ...)
(ConcatShiftBytesRightGroupedUint8x64 ...) => (VPALIGNR512 ...)
(PermuteFloat32x16 ...) => (VPERMPS512 ...)
(PermuteFloat64x4 ...) => (VPERMPD256 ...)
(PermuteFloat64x8 ...) => (VPERMPD512 ...)
-(PermuteInt8x16 ...) => (VPSHUFB128 ...)
+(PermuteInt8x16 ...) => (VPERMB128 ...)
(PermuteInt8x32 ...) => (VPERMB256 ...)
(PermuteInt8x64 ...) => (VPERMB512 ...)
(PermuteInt16x8 ...) => (VPERMW128 ...)
(PermuteInt32x16 ...) => (VPERMD512 ...)
(PermuteInt64x4 ...) => (VPERMQ256 ...)
(PermuteInt64x8 ...) => (VPERMQ512 ...)
-(PermuteUint8x16 ...) => (VPSHUFB128 ...)
+(PermuteUint8x16 ...) => (VPERMB128 ...)
(PermuteUint8x32 ...) => (VPERMB256 ...)
(PermuteUint8x64 ...) => (VPERMB512 ...)
(PermuteUint16x8 ...) => (VPERMW128 ...)
(PermuteUint32x16 ...) => (VPERMD512 ...)
(PermuteUint64x4 ...) => (VPERMQ256 ...)
(PermuteUint64x8 ...) => (VPERMQ512 ...)
-(Permute2Float32x4 ...) => (VPERMI2PS128 ...)
-(Permute2Float32x8 ...) => (VPERMI2PS256 ...)
-(Permute2Float32x16 ...) => (VPERMI2PS512 ...)
-(Permute2Float64x2 ...) => (VPERMI2PD128 ...)
-(Permute2Float64x4 ...) => (VPERMI2PD256 ...)
-(Permute2Float64x8 ...) => (VPERMI2PD512 ...)
-(Permute2Int8x16 ...) => (VPERMI2B128 ...)
-(Permute2Int8x32 ...) => (VPERMI2B256 ...)
-(Permute2Int8x64 ...) => (VPERMI2B512 ...)
-(Permute2Int16x8 ...) => (VPERMI2W128 ...)
-(Permute2Int16x16 ...) => (VPERMI2W256 ...)
-(Permute2Int16x32 ...) => (VPERMI2W512 ...)
-(Permute2Int32x4 ...) => (VPERMI2D128 ...)
-(Permute2Int32x8 ...) => (VPERMI2D256 ...)
-(Permute2Int32x16 ...) => (VPERMI2D512 ...)
-(Permute2Int64x2 ...) => (VPERMI2Q128 ...)
-(Permute2Int64x4 ...) => (VPERMI2Q256 ...)
-(Permute2Int64x8 ...) => (VPERMI2Q512 ...)
-(Permute2Uint8x16 ...) => (VPERMI2B128 ...)
-(Permute2Uint8x32 ...) => (VPERMI2B256 ...)
-(Permute2Uint8x64 ...) => (VPERMI2B512 ...)
-(Permute2Uint16x8 ...) => (VPERMI2W128 ...)
-(Permute2Uint16x16 ...) => (VPERMI2W256 ...)
-(Permute2Uint16x32 ...) => (VPERMI2W512 ...)
-(Permute2Uint32x4 ...) => (VPERMI2D128 ...)
-(Permute2Uint32x8 ...) => (VPERMI2D256 ...)
-(Permute2Uint32x16 ...) => (VPERMI2D512 ...)
-(Permute2Uint64x2 ...) => (VPERMI2Q128 ...)
-(Permute2Uint64x4 ...) => (VPERMI2Q256 ...)
-(Permute2Uint64x8 ...) => (VPERMI2Q512 ...)
-(PermuteConstantInt32x4 ...) => (VPSHUFD128 ...)
-(PermuteConstantUint32x4 ...) => (VPSHUFD128 ...)
-(PermuteConstantGroupedInt32x8 ...) => (VPSHUFD256 ...)
-(PermuteConstantGroupedInt32x16 ...) => (VPSHUFD512 ...)
-(PermuteConstantGroupedUint32x8 ...) => (VPSHUFD256 ...)
-(PermuteConstantGroupedUint32x16 ...) => (VPSHUFD512 ...)
-(PermuteConstantHiInt16x8 ...) => (VPSHUFHW128 ...)
-(PermuteConstantHiInt32x4 ...) => (VPSHUFHW128 ...)
-(PermuteConstantHiUint16x8 ...) => (VPSHUFHW128 ...)
-(PermuteConstantHiUint32x4 ...) => (VPSHUFHW128 ...)
-(PermuteConstantHiGroupedInt16x16 ...) => (VPSHUFHW256 ...)
-(PermuteConstantHiGroupedInt16x32 ...) => (VPSHUFHW512 ...)
-(PermuteConstantHiGroupedUint16x16 ...) => (VPSHUFHW256 ...)
-(PermuteConstantHiGroupedUint16x32 ...) => (VPSHUFHW512 ...)
-(PermuteConstantLoInt16x8 ...) => (VPSHUFHW128 ...)
-(PermuteConstantLoInt32x4 ...) => (VPSHUFHW128 ...)
-(PermuteConstantLoUint16x8 ...) => (VPSHUFHW128 ...)
-(PermuteConstantLoUint32x4 ...) => (VPSHUFHW128 ...)
-(PermuteConstantLoGroupedInt16x16 ...) => (VPSHUFHW256 ...)
-(PermuteConstantLoGroupedInt16x32 ...) => (VPSHUFHW512 ...)
-(PermuteConstantLoGroupedUint16x16 ...) => (VPSHUFHW256 ...)
-(PermuteConstantLoGroupedUint16x32 ...) => (VPSHUFHW512 ...)
-(PermuteGroupedInt8x32 ...) => (VPSHUFB256 ...)
-(PermuteGroupedInt8x64 ...) => (VPSHUFB512 ...)
-(PermuteGroupedUint8x32 ...) => (VPSHUFB256 ...)
-(PermuteGroupedUint8x64 ...) => (VPSHUFB512 ...)
+(PermuteOrZeroInt8x16 ...) => (VPSHUFB128 ...)
+(PermuteOrZeroUint8x16 ...) => (VPSHUFB128 ...)
+(PermuteOrZeroGroupedInt8x32 ...) => (VPSHUFB256 ...)
+(PermuteOrZeroGroupedInt8x64 ...) => (VPSHUFB512 ...)
+(PermuteOrZeroGroupedUint8x32 ...) => (VPSHUFB256 ...)
+(PermuteOrZeroGroupedUint8x64 ...) => (VPSHUFB512 ...)
(ReciprocalFloat32x4 ...) => (VRCPPS128 ...)
(ReciprocalFloat32x8 ...) => (VRCPPS256 ...)
(ReciprocalFloat32x16 ...) => (VRCP14PS512 ...)
(concatSelectedConstantGroupedUint32x16 ...) => (VSHUFPS512 ...)
(concatSelectedConstantGroupedUint64x4 ...) => (VSHUFPD256 ...)
(concatSelectedConstantGroupedUint64x8 ...) => (VSHUFPD512 ...)
+(permuteScalarsInt32x4 ...) => (VPSHUFD128 ...)
+(permuteScalarsUint32x4 ...) => (VPSHUFD128 ...)
+(permuteScalarsGroupedInt32x8 ...) => (VPSHUFD256 ...)
+(permuteScalarsGroupedInt32x16 ...) => (VPSHUFD512 ...)
+(permuteScalarsGroupedUint32x8 ...) => (VPSHUFD256 ...)
+(permuteScalarsGroupedUint32x16 ...) => (VPSHUFD512 ...)
+(permuteScalarsHiInt16x8 ...) => (VPSHUFHW128 ...)
+(permuteScalarsHiUint16x8 ...) => (VPSHUFHW128 ...)
+(permuteScalarsHiGroupedInt16x16 ...) => (VPSHUFHW256 ...)
+(permuteScalarsHiGroupedInt16x32 ...) => (VPSHUFHW512 ...)
+(permuteScalarsHiGroupedUint16x16 ...) => (VPSHUFHW256 ...)
+(permuteScalarsHiGroupedUint16x32 ...) => (VPSHUFHW512 ...)
+(permuteScalarsLoInt16x8 ...) => (VPSHUFLW128 ...)
+(permuteScalarsLoUint16x8 ...) => (VPSHUFLW128 ...)
+(permuteScalarsLoGroupedInt16x16 ...) => (VPSHUFLW256 ...)
+(permuteScalarsLoGroupedInt16x32 ...) => (VPSHUFLW512 ...)
+(permuteScalarsLoGroupedUint16x16 ...) => (VPSHUFLW256 ...)
+(permuteScalarsLoGroupedUint16x32 ...) => (VPSHUFLW512 ...)
(ternInt32x4 ...) => (VPTERNLOGD128 ...)
(ternInt32x8 ...) => (VPTERNLOGD256 ...)
(ternInt32x16 ...) => (VPTERNLOGD512 ...)
(VMOVDQU64Masked128 (VREDUCEPD128 [a] x) mask) => (VREDUCEPDMasked128 [a] x mask)
(VMOVDQU64Masked256 (VREDUCEPD256 [a] x) mask) => (VREDUCEPDMasked256 [a] x mask)
(VMOVDQU64Masked512 (VREDUCEPD512 [a] x) mask) => (VREDUCEPDMasked512 [a] x mask)
+(VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) => (VPERMI2BMasked128 x y z mask)
+(VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) => (VPERMI2BMasked256 x y z mask)
+(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask)
+(VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) => (VPERMI2WMasked128 x y z mask)
+(VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) => (VPERMI2WMasked256 x y z mask)
+(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask)
+(VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) => (VPERMI2PSMasked128 x y z mask)
+(VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) => (VPERMI2DMasked128 x y z mask)
+(VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) => (VPERMI2PSMasked256 x y z mask)
+(VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) => (VPERMI2DMasked256 x y z mask)
+(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask)
+(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask)
+(VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) => (VPERMI2PDMasked128 x y z mask)
+(VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) => (VPERMI2QMasked128 x y z mask)
+(VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) => (VPERMI2PDMasked256 x y z mask)
+(VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) => (VPERMI2QMasked256 x y z mask)
+(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask)
+(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask)
(VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask) => (VPALIGNRMasked256 [a] x y mask)
(VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask) => (VPALIGNRMasked512 [a] x y mask)
(VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask) => (VPALIGNRMasked128 [a] x y mask)
(VMOVDQU64Masked512 (VPOPCNTQ512 x) mask) => (VPOPCNTQMasked512 x mask)
(VMOVDQU32Masked512 (VPORD512 x y) mask) => (VPORDMasked512 x y mask)
(VMOVDQU64Masked512 (VPORQ512 x y) mask) => (VPORQMasked512 x y mask)
-(VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) => (VPERMI2BMasked128 x y z mask)
-(VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) => (VPERMI2BMasked256 x y z mask)
-(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask)
-(VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) => (VPERMI2WMasked128 x y z mask)
-(VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) => (VPERMI2WMasked256 x y z mask)
-(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask)
-(VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) => (VPERMI2PSMasked128 x y z mask)
-(VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) => (VPERMI2DMasked128 x y z mask)
-(VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) => (VPERMI2PSMasked256 x y z mask)
-(VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) => (VPERMI2DMasked256 x y z mask)
-(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask)
-(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask)
-(VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) => (VPERMI2PDMasked128 x y z mask)
-(VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) => (VPERMI2QMasked128 x y z mask)
-(VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) => (VPERMI2PDMasked256 x y z mask)
-(VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) => (VPERMI2QMasked256 x y z mask)
-(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask)
-(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask)
-(VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) => (VPSHUFDMasked256 [a] x mask)
-(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask)
-(VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) => (VPSHUFHWMasked256 [a] x mask)
-(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask)
-(VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) => (VPSHUFHWMasked128 [a] x mask)
-(VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) => (VPSHUFDMasked128 [a] x mask)
-(VMOVDQU8Masked256 (VPSHUFB256 x y) mask) => (VPSHUFBMasked256 x y mask)
-(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask)
-(VMOVDQU8Masked128 (VPSHUFB128 x y) mask) => (VPSHUFBMasked128 x y mask)
+(VMOVDQU8Masked128 (VPERMB128 x y) mask) => (VPERMBMasked128 x y mask)
(VMOVDQU8Masked256 (VPERMB256 x y) mask) => (VPERMBMasked256 x y mask)
(VMOVDQU8Masked512 (VPERMB512 x y) mask) => (VPERMBMasked512 x y mask)
(VMOVDQU16Masked128 (VPERMW128 x y) mask) => (VPERMWMasked128 x y mask)
(VMOVDQU64Masked256 (VPERMQ256 x y) mask) => (VPERMQMasked256 x y mask)
(VMOVDQU64Masked512 (VPERMPD512 x y) mask) => (VPERMPDMasked512 x y mask)
(VMOVDQU64Masked512 (VPERMQ512 x y) mask) => (VPERMQMasked512 x y mask)
+(VMOVDQU8Masked256 (VPSHUFB256 x y) mask) => (VPSHUFBMasked256 x y mask)
+(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask)
+(VMOVDQU8Masked128 (VPSHUFB128 x y) mask) => (VPSHUFBMasked128 x y mask)
(VMOVDQU32Masked512 (VRCP14PS512 x) mask) => (VRCP14PSMasked512 x mask)
(VMOVDQU64Masked128 (VRCP14PD128 x) mask) => (VRCP14PDMasked128 x mask)
(VMOVDQU64Masked256 (VRCP14PD256 x) mask) => (VRCP14PDMasked256 x mask)
(VMOVDQU16Masked512 (VPSUBUSW512 x y) mask) => (VPSUBUSWMasked512 x y mask)
(VMOVDQU32Masked512 (VPXORD512 x y) mask) => (VPXORDMasked512 x y mask)
(VMOVDQU64Masked512 (VPXORQ512 x y) mask) => (VPXORQMasked512 x y mask)
+(VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) => (VPSHUFDMasked256 [a] x mask)
+(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask)
+(VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) => (VPSHUFHWMasked256 [a] x mask)
+(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask)
+(VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) => (VPSHUFHWMasked128 [a] x mask)
+(VMOVDQU16Masked256 (VPSHUFLW256 [a] x) mask) => (VPSHUFLWMasked256 [a] x mask)
+(VMOVDQU16Masked512 (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512 [a] x mask)
+(VMOVDQU16Masked128 (VPSHUFLW128 [a] x) mask) => (VPSHUFLWMasked128 [a] x mask)
+(VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) => (VPSHUFDMasked128 [a] x mask)
(VMOVDQU16Masked128 (VPSLLW128const [a] x) mask) => (VPSLLWMasked128const [a] x mask)
(VMOVDQU16Masked256 (VPSLLW256const [a] x) mask) => (VPSLLWMasked256const [a] x mask)
(VMOVDQU16Masked512 (VPSLLW512const [a] x) mask) => (VPSLLWMasked512const [a] x mask)
(VPBLENDMWMasked512 dst (VPSHLDW512 [a] x y) mask) => (VPSHLDWMasked512Merging dst [a] x y mask)
(VPBLENDMWMasked512 dst (VPSHRDW512 [a] x y) mask) => (VPSHRDWMasked512Merging dst [a] x y mask)
(VPBLENDMWMasked512 dst (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512Merging dst [a] x mask)
+(VPBLENDMWMasked512 dst (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512Merging dst [a] x mask)
(VPBLENDMWMasked512 dst (VPSLLVW512 x y) mask) => (VPSLLVWMasked512Merging dst x y mask)
(VPBLENDMWMasked512 dst (VPSLLW512const [a] x) mask) => (VPSLLWMasked512constMerging dst [a] x mask)
(VPBLENDMWMasked512 dst (VPSRAVW512 x y) mask) => (VPSRAVWMasked512Merging dst x y mask)
(VPBLENDVB128 dst (VPSHUFB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked128Merging dst x y (VPMOVVec8x16ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSHUFD128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked128Merging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSHUFHW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(VPBLENDVB128 dst (VPSHUFLW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFLWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSLLD128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked128constMerging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSLLQ128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked128constMerging dst [a] x (VPMOVVec64x2ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSLLVD128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked128Merging dst x y (VPMOVVec32x4ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSHUFB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked256Merging dst x y (VPMOVVec8x32ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSHUFD256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked256Merging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSHUFHW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask))
+(VPBLENDVB256 dst (VPSHUFLW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFLWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSLLD256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked256constMerging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSLLQ256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked256constMerging dst [a] x (VPMOVVec64x4ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSLLVD256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked256Merging dst x y (VPMOVVec32x8ToM <types.TypeMask> mask))
(VREDUCEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VREDUCEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VREDUCEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+(VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem)
+(VPERMI2D128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem)
+(VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem)
+(VPERMI2D256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D256load {sym} [off] x y ptr mem)
+(VPERMI2PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS512load {sym} [off] x y ptr mem)
+(VPERMI2D512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D512load {sym} [off] x y ptr mem)
+(VPERMI2PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD128load {sym} [off] x y ptr mem)
+(VPERMI2Q128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q128load {sym} [off] x y ptr mem)
+(VPERMI2PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD256load {sym} [off] x y ptr mem)
+(VPERMI2Q256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q256load {sym} [off] x y ptr mem)
+(VPERMI2PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD512load {sym} [off] x y ptr mem)
+(VPERMI2Q512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q512load {sym} [off] x y ptr mem)
+(VPERMI2PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked128load {sym} [off] x y ptr mask mem)
+(VPERMI2DMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked128load {sym} [off] x y ptr mask mem)
+(VPERMI2PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked256load {sym} [off] x y ptr mask mem)
+(VPERMI2DMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked256load {sym} [off] x y ptr mask mem)
+(VPERMI2PSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked512load {sym} [off] x y ptr mask mem)
+(VPERMI2DMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked512load {sym} [off] x y ptr mask mem)
+(VPERMI2PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked128load {sym} [off] x y ptr mask mem)
+(VPERMI2QMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked128load {sym} [off] x y ptr mask mem)
+(VPERMI2PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked256load {sym} [off] x y ptr mask mem)
+(VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem)
+(VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem)
+(VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem)
(VPACKSSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDW512load {sym} [off] x ptr mem)
(VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem)
(VPACKSSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked256load {sym} [off] x ptr mask mem)
(VPERMQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ256load {sym} [off] x ptr mem)
(VPERMPD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMPD512load {sym} [off] x ptr mem)
(VPERMQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ512load {sym} [off] x ptr mem)
-(VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem)
-(VPERMI2D128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem)
-(VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem)
-(VPERMI2D256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D256load {sym} [off] x y ptr mem)
-(VPERMI2PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS512load {sym} [off] x y ptr mem)
-(VPERMI2D512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D512load {sym} [off] x y ptr mem)
-(VPERMI2PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD128load {sym} [off] x y ptr mem)
-(VPERMI2Q128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q128load {sym} [off] x y ptr mem)
-(VPERMI2PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD256load {sym} [off] x y ptr mem)
-(VPERMI2Q256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q256load {sym} [off] x y ptr mem)
-(VPERMI2PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD512load {sym} [off] x y ptr mem)
-(VPERMI2Q512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q512load {sym} [off] x y ptr mem)
-(VPERMI2PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked128load {sym} [off] x y ptr mask mem)
-(VPERMI2DMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked128load {sym} [off] x y ptr mask mem)
-(VPERMI2PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked256load {sym} [off] x y ptr mask mem)
-(VPERMI2DMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked256load {sym} [off] x y ptr mask mem)
-(VPERMI2PSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked512load {sym} [off] x y ptr mask mem)
-(VPERMI2DMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked512load {sym} [off] x y ptr mask mem)
-(VPERMI2PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked128load {sym} [off] x y ptr mask mem)
-(VPERMI2QMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked128load {sym} [off] x y ptr mask mem)
-(VPERMI2PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked256load {sym} [off] x y ptr mask mem)
-(VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem)
-(VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem)
-(VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem)
-(VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
-(VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
-(VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
-(VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPERMPSMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked256load {sym} [off] x ptr mask mem)
(VPERMDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMDMasked256load {sym} [off] x ptr mask mem)
(VPERMPSMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked512load {sym} [off] x ptr mask mem)
(VPBLENDMQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMQMasked512load {sym} [off] x ptr mask mem)
(VSHUFPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
(VSHUFPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+(VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
+(VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+(VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+(VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPSLLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
(VPSLLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
(VPSLLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
{name: "VPDPWSSDMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPDPWSSDMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPDPWSSDMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true},
+ {name: "VPERMB128", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
+ {name: "VPERMBMasked128", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMBMasked512", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFHWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHUFHWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFHWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
+ {name: "VPSHUFLW128", argLength: 1, reg: w11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VPSHUFLW256", argLength: 1, reg: v11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+ {name: "VPSHUFLW512", argLength: 1, reg: w11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
+ {name: "VPSHUFLWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VPSHUFLWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+ {name: "VPSHUFLWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSLLD128const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSLLD256const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSLLD512const", argLength: 1, reg: w11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHUFHWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPSHUFHWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPSHUFHWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
+ {name: "VPSHUFLWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
+ {name: "VPSHUFLWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
+ {name: "VPSHUFLWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPSLLDMasked128constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPSLLDMasked256constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPSLLDMasked512constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "CompressUint64x2", argLength: 2, commutative: false},
{name: "CompressUint64x4", argLength: 2, commutative: false},
{name: "CompressUint64x8", argLength: 2, commutative: false},
+ {name: "ConcatPermuteFloat32x4", argLength: 3, commutative: false},
+ {name: "ConcatPermuteFloat32x8", argLength: 3, commutative: false},
+ {name: "ConcatPermuteFloat32x16", argLength: 3, commutative: false},
+ {name: "ConcatPermuteFloat64x2", argLength: 3, commutative: false},
+ {name: "ConcatPermuteFloat64x4", argLength: 3, commutative: false},
+ {name: "ConcatPermuteFloat64x8", argLength: 3, commutative: false},
+ {name: "ConcatPermuteInt8x16", argLength: 3, commutative: false},
+ {name: "ConcatPermuteInt8x32", argLength: 3, commutative: false},
+ {name: "ConcatPermuteInt8x64", argLength: 3, commutative: false},
+ {name: "ConcatPermuteInt16x8", argLength: 3, commutative: false},
+ {name: "ConcatPermuteInt16x16", argLength: 3, commutative: false},
+ {name: "ConcatPermuteInt16x32", argLength: 3, commutative: false},
+ {name: "ConcatPermuteInt32x4", argLength: 3, commutative: false},
+ {name: "ConcatPermuteInt32x8", argLength: 3, commutative: false},
+ {name: "ConcatPermuteInt32x16", argLength: 3, commutative: false},
+ {name: "ConcatPermuteInt64x2", argLength: 3, commutative: false},
+ {name: "ConcatPermuteInt64x4", argLength: 3, commutative: false},
+ {name: "ConcatPermuteInt64x8", argLength: 3, commutative: false},
+ {name: "ConcatPermuteUint8x16", argLength: 3, commutative: false},
+ {name: "ConcatPermuteUint8x32", argLength: 3, commutative: false},
+ {name: "ConcatPermuteUint8x64", argLength: 3, commutative: false},
+ {name: "ConcatPermuteUint16x8", argLength: 3, commutative: false},
+ {name: "ConcatPermuteUint16x16", argLength: 3, commutative: false},
+ {name: "ConcatPermuteUint16x32", argLength: 3, commutative: false},
+ {name: "ConcatPermuteUint32x4", argLength: 3, commutative: false},
+ {name: "ConcatPermuteUint32x8", argLength: 3, commutative: false},
+ {name: "ConcatPermuteUint32x16", argLength: 3, commutative: false},
+ {name: "ConcatPermuteUint64x2", argLength: 3, commutative: false},
+ {name: "ConcatPermuteUint64x4", argLength: 3, commutative: false},
+ {name: "ConcatPermuteUint64x8", argLength: 3, commutative: false},
{name: "ConvertToInt8Int16x8", argLength: 1, commutative: false},
{name: "ConvertToInt8Int16x16", argLength: 1, commutative: false},
{name: "ConvertToInt8Int16x32", argLength: 1, commutative: false},
{name: "OrUint64x2", argLength: 2, commutative: true},
{name: "OrUint64x4", argLength: 2, commutative: true},
{name: "OrUint64x8", argLength: 2, commutative: true},
- {name: "Permute2Float32x4", argLength: 3, commutative: false},
- {name: "Permute2Float32x8", argLength: 3, commutative: false},
- {name: "Permute2Float32x16", argLength: 3, commutative: false},
- {name: "Permute2Float64x2", argLength: 3, commutative: false},
- {name: "Permute2Float64x4", argLength: 3, commutative: false},
- {name: "Permute2Float64x8", argLength: 3, commutative: false},
- {name: "Permute2Int8x16", argLength: 3, commutative: false},
- {name: "Permute2Int8x32", argLength: 3, commutative: false},
- {name: "Permute2Int8x64", argLength: 3, commutative: false},
- {name: "Permute2Int16x8", argLength: 3, commutative: false},
- {name: "Permute2Int16x16", argLength: 3, commutative: false},
- {name: "Permute2Int16x32", argLength: 3, commutative: false},
- {name: "Permute2Int32x4", argLength: 3, commutative: false},
- {name: "Permute2Int32x8", argLength: 3, commutative: false},
- {name: "Permute2Int32x16", argLength: 3, commutative: false},
- {name: "Permute2Int64x2", argLength: 3, commutative: false},
- {name: "Permute2Int64x4", argLength: 3, commutative: false},
- {name: "Permute2Int64x8", argLength: 3, commutative: false},
- {name: "Permute2Uint8x16", argLength: 3, commutative: false},
- {name: "Permute2Uint8x32", argLength: 3, commutative: false},
- {name: "Permute2Uint8x64", argLength: 3, commutative: false},
- {name: "Permute2Uint16x8", argLength: 3, commutative: false},
- {name: "Permute2Uint16x16", argLength: 3, commutative: false},
- {name: "Permute2Uint16x32", argLength: 3, commutative: false},
- {name: "Permute2Uint32x4", argLength: 3, commutative: false},
- {name: "Permute2Uint32x8", argLength: 3, commutative: false},
- {name: "Permute2Uint32x16", argLength: 3, commutative: false},
- {name: "Permute2Uint64x2", argLength: 3, commutative: false},
- {name: "Permute2Uint64x4", argLength: 3, commutative: false},
- {name: "Permute2Uint64x8", argLength: 3, commutative: false},
{name: "PermuteFloat32x8", argLength: 2, commutative: false},
{name: "PermuteFloat32x16", argLength: 2, commutative: false},
{name: "PermuteFloat64x4", argLength: 2, commutative: false},
{name: "PermuteFloat64x8", argLength: 2, commutative: false},
- {name: "PermuteGroupedInt8x32", argLength: 2, commutative: false},
- {name: "PermuteGroupedInt8x64", argLength: 2, commutative: false},
- {name: "PermuteGroupedUint8x32", argLength: 2, commutative: false},
- {name: "PermuteGroupedUint8x64", argLength: 2, commutative: false},
{name: "PermuteInt8x16", argLength: 2, commutative: false},
{name: "PermuteInt8x32", argLength: 2, commutative: false},
{name: "PermuteInt8x64", argLength: 2, commutative: false},
{name: "PermuteInt32x16", argLength: 2, commutative: false},
{name: "PermuteInt64x4", argLength: 2, commutative: false},
{name: "PermuteInt64x8", argLength: 2, commutative: false},
+ {name: "PermuteOrZeroGroupedInt8x32", argLength: 2, commutative: false},
+ {name: "PermuteOrZeroGroupedInt8x64", argLength: 2, commutative: false},
+ {name: "PermuteOrZeroGroupedUint8x32", argLength: 2, commutative: false},
+ {name: "PermuteOrZeroGroupedUint8x64", argLength: 2, commutative: false},
+ {name: "PermuteOrZeroInt8x16", argLength: 2, commutative: false},
+ {name: "PermuteOrZeroUint8x16", argLength: 2, commutative: false},
{name: "PermuteUint8x16", argLength: 2, commutative: false},
{name: "PermuteUint8x32", argLength: 2, commutative: false},
{name: "PermuteUint8x64", argLength: 2, commutative: false},
{name: "GetElemUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemUint64x2", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantHiInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantHiUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantLoInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantLoUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
- {name: "PermuteConstantUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RotateAllLeftInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RotateAllLeftInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RotateAllLeftInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "concatSelectedConstantInt64x2", argLength: 2, commutative: false, aux: "UInt8"},
{name: "concatSelectedConstantUint32x4", argLength: 2, commutative: false, aux: "UInt8"},
{name: "concatSelectedConstantUint64x2", argLength: 2, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "permuteScalarsUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "ternInt32x4", argLength: 3, commutative: false, aux: "UInt8"},
{name: "ternInt32x8", argLength: 3, commutative: false, aux: "UInt8"},
{name: "ternInt32x16", argLength: 3, commutative: false, aux: "UInt8"},
OpAMD64VPDPWSSDMasked128
OpAMD64VPDPWSSDMasked256
OpAMD64VPDPWSSDMasked512
+ OpAMD64VPERMB128
OpAMD64VPERMB256
OpAMD64VPERMB512
+ OpAMD64VPERMBMasked128
OpAMD64VPERMBMasked256
OpAMD64VPERMBMasked512
OpAMD64VPERMD256
OpAMD64VPSHUFHWMasked128
OpAMD64VPSHUFHWMasked256
OpAMD64VPSHUFHWMasked512
+ OpAMD64VPSHUFLW128
+ OpAMD64VPSHUFLW256
+ OpAMD64VPSHUFLW512
+ OpAMD64VPSHUFLWMasked128
+ OpAMD64VPSHUFLWMasked256
+ OpAMD64VPSHUFLWMasked512
OpAMD64VPSLLD128const
OpAMD64VPSLLD256const
OpAMD64VPSLLD512const
OpAMD64VPSHUFHWMasked128Merging
OpAMD64VPSHUFHWMasked256Merging
OpAMD64VPSHUFHWMasked512Merging
+ OpAMD64VPSHUFLWMasked128Merging
+ OpAMD64VPSHUFLWMasked256Merging
+ OpAMD64VPSHUFLWMasked512Merging
OpAMD64VPSLLDMasked128constMerging
OpAMD64VPSLLDMasked256constMerging
OpAMD64VPSLLDMasked512constMerging
OpCompressUint64x2
OpCompressUint64x4
OpCompressUint64x8
+ OpConcatPermuteFloat32x4
+ OpConcatPermuteFloat32x8
+ OpConcatPermuteFloat32x16
+ OpConcatPermuteFloat64x2
+ OpConcatPermuteFloat64x4
+ OpConcatPermuteFloat64x8
+ OpConcatPermuteInt8x16
+ OpConcatPermuteInt8x32
+ OpConcatPermuteInt8x64
+ OpConcatPermuteInt16x8
+ OpConcatPermuteInt16x16
+ OpConcatPermuteInt16x32
+ OpConcatPermuteInt32x4
+ OpConcatPermuteInt32x8
+ OpConcatPermuteInt32x16
+ OpConcatPermuteInt64x2
+ OpConcatPermuteInt64x4
+ OpConcatPermuteInt64x8
+ OpConcatPermuteUint8x16
+ OpConcatPermuteUint8x32
+ OpConcatPermuteUint8x64
+ OpConcatPermuteUint16x8
+ OpConcatPermuteUint16x16
+ OpConcatPermuteUint16x32
+ OpConcatPermuteUint32x4
+ OpConcatPermuteUint32x8
+ OpConcatPermuteUint32x16
+ OpConcatPermuteUint64x2
+ OpConcatPermuteUint64x4
+ OpConcatPermuteUint64x8
OpConvertToInt8Int16x8
OpConvertToInt8Int16x16
OpConvertToInt8Int16x32
OpOrUint64x2
OpOrUint64x4
OpOrUint64x8
- OpPermute2Float32x4
- OpPermute2Float32x8
- OpPermute2Float32x16
- OpPermute2Float64x2
- OpPermute2Float64x4
- OpPermute2Float64x8
- OpPermute2Int8x16
- OpPermute2Int8x32
- OpPermute2Int8x64
- OpPermute2Int16x8
- OpPermute2Int16x16
- OpPermute2Int16x32
- OpPermute2Int32x4
- OpPermute2Int32x8
- OpPermute2Int32x16
- OpPermute2Int64x2
- OpPermute2Int64x4
- OpPermute2Int64x8
- OpPermute2Uint8x16
- OpPermute2Uint8x32
- OpPermute2Uint8x64
- OpPermute2Uint16x8
- OpPermute2Uint16x16
- OpPermute2Uint16x32
- OpPermute2Uint32x4
- OpPermute2Uint32x8
- OpPermute2Uint32x16
- OpPermute2Uint64x2
- OpPermute2Uint64x4
- OpPermute2Uint64x8
OpPermuteFloat32x8
OpPermuteFloat32x16
OpPermuteFloat64x4
OpPermuteFloat64x8
- OpPermuteGroupedInt8x32
- OpPermuteGroupedInt8x64
- OpPermuteGroupedUint8x32
- OpPermuteGroupedUint8x64
OpPermuteInt8x16
OpPermuteInt8x32
OpPermuteInt8x64
OpPermuteInt32x16
OpPermuteInt64x4
OpPermuteInt64x8
+ OpPermuteOrZeroGroupedInt8x32
+ OpPermuteOrZeroGroupedInt8x64
+ OpPermuteOrZeroGroupedUint8x32
+ OpPermuteOrZeroGroupedUint8x64
+ OpPermuteOrZeroInt8x16
+ OpPermuteOrZeroUint8x16
OpPermuteUint8x16
OpPermuteUint8x32
OpPermuteUint8x64
OpGetElemUint16x8
OpGetElemUint32x4
OpGetElemUint64x2
- OpPermuteConstantGroupedInt32x8
- OpPermuteConstantGroupedInt32x16
- OpPermuteConstantGroupedUint32x8
- OpPermuteConstantGroupedUint32x16
- OpPermuteConstantHiGroupedInt16x16
- OpPermuteConstantHiGroupedInt16x32
- OpPermuteConstantHiGroupedUint16x16
- OpPermuteConstantHiGroupedUint16x32
- OpPermuteConstantHiInt16x8
- OpPermuteConstantHiInt32x4
- OpPermuteConstantHiUint16x8
- OpPermuteConstantHiUint32x4
- OpPermuteConstantInt32x4
- OpPermuteConstantLoGroupedInt16x16
- OpPermuteConstantLoGroupedInt16x32
- OpPermuteConstantLoGroupedUint16x16
- OpPermuteConstantLoGroupedUint16x32
- OpPermuteConstantLoInt16x8
- OpPermuteConstantLoInt32x4
- OpPermuteConstantLoUint16x8
- OpPermuteConstantLoUint32x4
- OpPermuteConstantUint32x4
OpRotateAllLeftInt32x4
OpRotateAllLeftInt32x8
OpRotateAllLeftInt32x16
OpconcatSelectedConstantInt64x2
OpconcatSelectedConstantUint32x4
OpconcatSelectedConstantUint64x2
+ OppermuteScalarsGroupedInt32x8
+ OppermuteScalarsGroupedInt32x16
+ OppermuteScalarsGroupedUint32x8
+ OppermuteScalarsGroupedUint32x16
+ OppermuteScalarsHiGroupedInt16x16
+ OppermuteScalarsHiGroupedInt16x32
+ OppermuteScalarsHiGroupedUint16x16
+ OppermuteScalarsHiGroupedUint16x32
+ OppermuteScalarsHiInt16x8
+ OppermuteScalarsHiUint16x8
+ OppermuteScalarsInt32x4
+ OppermuteScalarsLoGroupedInt16x16
+ OppermuteScalarsLoGroupedInt16x32
+ OppermuteScalarsLoGroupedUint16x16
+ OppermuteScalarsLoGroupedUint16x32
+ OppermuteScalarsLoInt16x8
+ OppermuteScalarsLoUint16x8
+ OppermuteScalarsUint32x4
OpternInt32x4
OpternInt32x8
OpternInt32x16
},
},
},
+ {
+ name: "VPERMB128",
+ argLen: 2,
+ asm: x86.AVPERMB,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
{
name: "VPERMB256",
argLen: 2,
},
},
},
+ {
+ name: "VPERMBMasked128",
+ argLen: 3,
+ asm: x86.AVPERMB,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
{
name: "VPERMBMasked256",
argLen: 3,
},
},
},
+ {
+ name: "VPSHUFLW128",
+ auxType: auxUInt8,
+ argLen: 1,
+ asm: x86.AVPSHUFLW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPSHUFLW256",
+ auxType: auxUInt8,
+ argLen: 1,
+ asm: x86.AVPSHUFLW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ outputs: []outputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ },
+ },
+ {
+ name: "VPSHUFLW512",
+ auxType: auxUInt8,
+ argLen: 1,
+ asm: x86.AVPSHUFLW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPSHUFLWMasked128",
+ auxType: auxUInt8,
+ argLen: 2,
+ asm: x86.AVPSHUFLW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPSHUFLWMasked256",
+ auxType: auxUInt8,
+ argLen: 2,
+ asm: x86.AVPSHUFLW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPSHUFLWMasked512",
+ auxType: auxUInt8,
+ argLen: 2,
+ asm: x86.AVPSHUFLW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
{
name: "VPSLLD128const",
auxType: auxUInt8,
},
},
},
+ {
+ name: "VPSHUFLWMasked128Merging",
+ auxType: auxUInt8,
+ argLen: 3,
+ resultInArg0: true,
+ asm: x86.AVPSHUFLW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPSHUFLWMasked256Merging",
+ auxType: auxUInt8,
+ argLen: 3,
+ resultInArg0: true,
+ asm: x86.AVPSHUFLW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPSHUFLWMasked512Merging",
+ auxType: auxUInt8,
+ argLen: 3,
+ resultInArg0: true,
+ asm: x86.AVPSHUFLW,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
{
name: "VPSLLDMasked128constMerging",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
+ {
+ name: "ConcatPermuteFloat32x4",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteFloat32x8",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteFloat32x16",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteFloat64x2",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteFloat64x4",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteFloat64x8",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteInt8x16",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteInt8x32",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteInt8x64",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteInt16x8",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteInt16x16",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteInt16x32",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteInt32x4",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteInt32x8",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteInt32x16",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteInt64x2",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteInt64x4",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteInt64x8",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteUint8x16",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteUint8x32",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteUint8x64",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteUint16x8",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteUint16x16",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteUint16x32",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteUint32x4",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteUint32x8",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteUint32x16",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteUint64x2",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteUint64x4",
+ argLen: 3,
+ generic: true,
+ },
+ {
+ name: "ConcatPermuteUint64x8",
+ argLen: 3,
+ generic: true,
+ },
{
name: "ConvertToInt8Int16x8",
argLen: 1,
generic: true,
},
{
- name: "Permute2Float32x4",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Float32x8",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Float32x16",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Float64x2",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Float64x4",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Float64x8",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Int8x16",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Int8x32",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Int8x64",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Int16x8",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Int16x16",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Int16x32",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Int32x4",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Int32x8",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Int32x16",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Int64x2",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Int64x4",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Int64x8",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Uint8x16",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Uint8x32",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Uint8x64",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Uint16x8",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Uint16x16",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Uint16x32",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Uint32x4",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Uint32x8",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Uint32x16",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Uint64x2",
- argLen: 3,
- generic: true,
- },
- {
- name: "Permute2Uint64x4",
- argLen: 3,
+ name: "PermuteFloat32x8",
+ argLen: 2,
generic: true,
},
{
- name: "Permute2Uint64x8",
- argLen: 3,
+ name: "PermuteFloat32x16",
+ argLen: 2,
generic: true,
},
{
- name: "PermuteFloat32x8",
+ name: "PermuteFloat64x4",
argLen: 2,
generic: true,
},
{
- name: "PermuteFloat32x16",
+ name: "PermuteFloat64x8",
argLen: 2,
generic: true,
},
{
- name: "PermuteFloat64x4",
+ name: "PermuteInt8x16",
argLen: 2,
generic: true,
},
{
- name: "PermuteFloat64x8",
+ name: "PermuteInt8x32",
argLen: 2,
generic: true,
},
{
- name: "PermuteGroupedInt8x32",
+ name: "PermuteInt8x64",
argLen: 2,
generic: true,
},
{
- name: "PermuteGroupedInt8x64",
+ name: "PermuteInt16x8",
argLen: 2,
generic: true,
},
{
- name: "PermuteGroupedUint8x32",
+ name: "PermuteInt16x16",
argLen: 2,
generic: true,
},
{
- name: "PermuteGroupedUint8x64",
+ name: "PermuteInt16x32",
argLen: 2,
generic: true,
},
{
- name: "PermuteInt8x16",
+ name: "PermuteInt32x8",
argLen: 2,
generic: true,
},
{
- name: "PermuteInt8x32",
+ name: "PermuteInt32x16",
argLen: 2,
generic: true,
},
{
- name: "PermuteInt8x64",
+ name: "PermuteInt64x4",
argLen: 2,
generic: true,
},
{
- name: "PermuteInt16x8",
+ name: "PermuteInt64x8",
argLen: 2,
generic: true,
},
{
- name: "PermuteInt16x16",
+ name: "PermuteOrZeroGroupedInt8x32",
argLen: 2,
generic: true,
},
{
- name: "PermuteInt16x32",
+ name: "PermuteOrZeroGroupedInt8x64",
argLen: 2,
generic: true,
},
{
- name: "PermuteInt32x8",
+ name: "PermuteOrZeroGroupedUint8x32",
argLen: 2,
generic: true,
},
{
- name: "PermuteInt32x16",
+ name: "PermuteOrZeroGroupedUint8x64",
argLen: 2,
generic: true,
},
{
- name: "PermuteInt64x4",
+ name: "PermuteOrZeroInt8x16",
argLen: 2,
generic: true,
},
{
- name: "PermuteInt64x8",
+ name: "PermuteOrZeroUint8x16",
argLen: 2,
generic: true,
},
argLen: 1,
generic: true,
},
- {
- name: "PermuteConstantGroupedInt32x8",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantGroupedInt32x16",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantGroupedUint32x8",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantGroupedUint32x16",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantHiGroupedInt16x16",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantHiGroupedInt16x32",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantHiGroupedUint16x16",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantHiGroupedUint16x32",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantHiInt16x8",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantHiInt32x4",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantHiUint16x8",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantHiUint32x4",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantInt32x4",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantLoGroupedInt16x16",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantLoGroupedInt16x32",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantLoGroupedUint16x16",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantLoGroupedUint16x32",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantLoInt16x8",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantLoInt32x4",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantLoUint16x8",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantLoUint32x4",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
- {
- name: "PermuteConstantUint32x4",
- auxType: auxUInt8,
- argLen: 1,
- generic: true,
- },
{
name: "RotateAllLeftInt32x4",
auxType: auxUInt8,
argLen: 2,
generic: true,
},
+ {
+ name: "permuteScalarsGroupedInt32x8",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsGroupedInt32x16",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsGroupedUint32x8",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsGroupedUint32x16",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsHiGroupedInt16x16",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsHiGroupedInt16x32",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsHiGroupedUint16x16",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsHiGroupedUint16x32",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsHiInt16x8",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsHiUint16x8",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsInt32x4",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsLoGroupedInt16x16",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsLoGroupedInt16x32",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsLoGroupedUint16x16",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsLoGroupedUint16x32",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsLoInt16x8",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsLoUint16x8",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "permuteScalarsUint32x4",
+ auxType: auxUInt8,
+ argLen: 1,
+ generic: true,
+ },
{
name: "ternInt32x4",
auxType: auxUInt8,
return rewriteValueAMD64_OpCompressUint8x32(v)
case OpCompressUint8x64:
return rewriteValueAMD64_OpCompressUint8x64(v)
+ case OpConcatPermuteFloat32x16:
+ v.Op = OpAMD64VPERMI2PS512
+ return true
+ case OpConcatPermuteFloat32x4:
+ v.Op = OpAMD64VPERMI2PS128
+ return true
+ case OpConcatPermuteFloat32x8:
+ v.Op = OpAMD64VPERMI2PS256
+ return true
+ case OpConcatPermuteFloat64x2:
+ v.Op = OpAMD64VPERMI2PD128
+ return true
+ case OpConcatPermuteFloat64x4:
+ v.Op = OpAMD64VPERMI2PD256
+ return true
+ case OpConcatPermuteFloat64x8:
+ v.Op = OpAMD64VPERMI2PD512
+ return true
+ case OpConcatPermuteInt16x16:
+ v.Op = OpAMD64VPERMI2W256
+ return true
+ case OpConcatPermuteInt16x32:
+ v.Op = OpAMD64VPERMI2W512
+ return true
+ case OpConcatPermuteInt16x8:
+ v.Op = OpAMD64VPERMI2W128
+ return true
+ case OpConcatPermuteInt32x16:
+ v.Op = OpAMD64VPERMI2D512
+ return true
+ case OpConcatPermuteInt32x4:
+ v.Op = OpAMD64VPERMI2D128
+ return true
+ case OpConcatPermuteInt32x8:
+ v.Op = OpAMD64VPERMI2D256
+ return true
+ case OpConcatPermuteInt64x2:
+ v.Op = OpAMD64VPERMI2Q128
+ return true
+ case OpConcatPermuteInt64x4:
+ v.Op = OpAMD64VPERMI2Q256
+ return true
+ case OpConcatPermuteInt64x8:
+ v.Op = OpAMD64VPERMI2Q512
+ return true
+ case OpConcatPermuteInt8x16:
+ v.Op = OpAMD64VPERMI2B128
+ return true
+ case OpConcatPermuteInt8x32:
+ v.Op = OpAMD64VPERMI2B256
+ return true
+ case OpConcatPermuteInt8x64:
+ v.Op = OpAMD64VPERMI2B512
+ return true
+ case OpConcatPermuteUint16x16:
+ v.Op = OpAMD64VPERMI2W256
+ return true
+ case OpConcatPermuteUint16x32:
+ v.Op = OpAMD64VPERMI2W512
+ return true
+ case OpConcatPermuteUint16x8:
+ v.Op = OpAMD64VPERMI2W128
+ return true
+ case OpConcatPermuteUint32x16:
+ v.Op = OpAMD64VPERMI2D512
+ return true
+ case OpConcatPermuteUint32x4:
+ v.Op = OpAMD64VPERMI2D128
+ return true
+ case OpConcatPermuteUint32x8:
+ v.Op = OpAMD64VPERMI2D256
+ return true
+ case OpConcatPermuteUint64x2:
+ v.Op = OpAMD64VPERMI2Q128
+ return true
+ case OpConcatPermuteUint64x4:
+ v.Op = OpAMD64VPERMI2Q256
+ return true
+ case OpConcatPermuteUint64x8:
+ v.Op = OpAMD64VPERMI2Q512
+ return true
+ case OpConcatPermuteUint8x16:
+ v.Op = OpAMD64VPERMI2B128
+ return true
+ case OpConcatPermuteUint8x32:
+ v.Op = OpAMD64VPERMI2B256
+ return true
+ case OpConcatPermuteUint8x64:
+ v.Op = OpAMD64VPERMI2B512
+ return true
case OpConcatShiftBytesRightGroupedUint8x32:
v.Op = OpAMD64VPALIGNR256
return true
case OpPanicBounds:
v.Op = OpAMD64LoweredPanicBoundsRR
return true
- case OpPermute2Float32x16:
- v.Op = OpAMD64VPERMI2PS512
- return true
- case OpPermute2Float32x4:
- v.Op = OpAMD64VPERMI2PS128
- return true
- case OpPermute2Float32x8:
- v.Op = OpAMD64VPERMI2PS256
- return true
- case OpPermute2Float64x2:
- v.Op = OpAMD64VPERMI2PD128
- return true
- case OpPermute2Float64x4:
- v.Op = OpAMD64VPERMI2PD256
- return true
- case OpPermute2Float64x8:
- v.Op = OpAMD64VPERMI2PD512
- return true
- case OpPermute2Int16x16:
- v.Op = OpAMD64VPERMI2W256
- return true
- case OpPermute2Int16x32:
- v.Op = OpAMD64VPERMI2W512
- return true
- case OpPermute2Int16x8:
- v.Op = OpAMD64VPERMI2W128
- return true
- case OpPermute2Int32x16:
- v.Op = OpAMD64VPERMI2D512
- return true
- case OpPermute2Int32x4:
- v.Op = OpAMD64VPERMI2D128
- return true
- case OpPermute2Int32x8:
- v.Op = OpAMD64VPERMI2D256
- return true
- case OpPermute2Int64x2:
- v.Op = OpAMD64VPERMI2Q128
- return true
- case OpPermute2Int64x4:
- v.Op = OpAMD64VPERMI2Q256
- return true
- case OpPermute2Int64x8:
- v.Op = OpAMD64VPERMI2Q512
- return true
- case OpPermute2Int8x16:
- v.Op = OpAMD64VPERMI2B128
- return true
- case OpPermute2Int8x32:
- v.Op = OpAMD64VPERMI2B256
- return true
- case OpPermute2Int8x64:
- v.Op = OpAMD64VPERMI2B512
- return true
- case OpPermute2Uint16x16:
- v.Op = OpAMD64VPERMI2W256
- return true
- case OpPermute2Uint16x32:
- v.Op = OpAMD64VPERMI2W512
- return true
- case OpPermute2Uint16x8:
- v.Op = OpAMD64VPERMI2W128
- return true
- case OpPermute2Uint32x16:
- v.Op = OpAMD64VPERMI2D512
- return true
- case OpPermute2Uint32x4:
- v.Op = OpAMD64VPERMI2D128
- return true
- case OpPermute2Uint32x8:
- v.Op = OpAMD64VPERMI2D256
- return true
- case OpPermute2Uint64x2:
- v.Op = OpAMD64VPERMI2Q128
- return true
- case OpPermute2Uint64x4:
- v.Op = OpAMD64VPERMI2Q256
- return true
- case OpPermute2Uint64x8:
- v.Op = OpAMD64VPERMI2Q512
- return true
- case OpPermute2Uint8x16:
- v.Op = OpAMD64VPERMI2B128
- return true
- case OpPermute2Uint8x32:
- v.Op = OpAMD64VPERMI2B256
- return true
- case OpPermute2Uint8x64:
- v.Op = OpAMD64VPERMI2B512
- return true
- case OpPermuteConstantGroupedInt32x16:
- v.Op = OpAMD64VPSHUFD512
- return true
- case OpPermuteConstantGroupedInt32x8:
- v.Op = OpAMD64VPSHUFD256
- return true
- case OpPermuteConstantGroupedUint32x16:
- v.Op = OpAMD64VPSHUFD512
- return true
- case OpPermuteConstantGroupedUint32x8:
- v.Op = OpAMD64VPSHUFD256
- return true
- case OpPermuteConstantHiGroupedInt16x16:
- v.Op = OpAMD64VPSHUFHW256
- return true
- case OpPermuteConstantHiGroupedInt16x32:
- v.Op = OpAMD64VPSHUFHW512
- return true
- case OpPermuteConstantHiGroupedUint16x16:
- v.Op = OpAMD64VPSHUFHW256
- return true
- case OpPermuteConstantHiGroupedUint16x32:
- v.Op = OpAMD64VPSHUFHW512
- return true
- case OpPermuteConstantHiInt16x8:
- v.Op = OpAMD64VPSHUFHW128
- return true
- case OpPermuteConstantHiInt32x4:
- v.Op = OpAMD64VPSHUFHW128
- return true
- case OpPermuteConstantHiUint16x8:
- v.Op = OpAMD64VPSHUFHW128
- return true
- case OpPermuteConstantHiUint32x4:
- v.Op = OpAMD64VPSHUFHW128
- return true
- case OpPermuteConstantInt32x4:
- v.Op = OpAMD64VPSHUFD128
- return true
- case OpPermuteConstantLoGroupedInt16x16:
- v.Op = OpAMD64VPSHUFHW256
- return true
- case OpPermuteConstantLoGroupedInt16x32:
- v.Op = OpAMD64VPSHUFHW512
- return true
- case OpPermuteConstantLoGroupedUint16x16:
- v.Op = OpAMD64VPSHUFHW256
- return true
- case OpPermuteConstantLoGroupedUint16x32:
- v.Op = OpAMD64VPSHUFHW512
- return true
- case OpPermuteConstantLoInt16x8:
- v.Op = OpAMD64VPSHUFHW128
- return true
- case OpPermuteConstantLoInt32x4:
- v.Op = OpAMD64VPSHUFHW128
- return true
- case OpPermuteConstantLoUint16x8:
- v.Op = OpAMD64VPSHUFHW128
- return true
- case OpPermuteConstantLoUint32x4:
- v.Op = OpAMD64VPSHUFHW128
- return true
- case OpPermuteConstantUint32x4:
- v.Op = OpAMD64VPSHUFD128
- return true
case OpPermuteFloat32x16:
v.Op = OpAMD64VPERMPS512
return true
case OpPermuteFloat64x8:
v.Op = OpAMD64VPERMPD512
return true
- case OpPermuteGroupedInt8x32:
- v.Op = OpAMD64VPSHUFB256
- return true
- case OpPermuteGroupedInt8x64:
- v.Op = OpAMD64VPSHUFB512
- return true
- case OpPermuteGroupedUint8x32:
- v.Op = OpAMD64VPSHUFB256
- return true
- case OpPermuteGroupedUint8x64:
- v.Op = OpAMD64VPSHUFB512
- return true
case OpPermuteInt16x16:
v.Op = OpAMD64VPERMW256
return true
v.Op = OpAMD64VPERMQ512
return true
case OpPermuteInt8x16:
- v.Op = OpAMD64VPSHUFB128
+ v.Op = OpAMD64VPERMB128
return true
case OpPermuteInt8x32:
v.Op = OpAMD64VPERMB256
case OpPermuteInt8x64:
v.Op = OpAMD64VPERMB512
return true
+ case OpPermuteOrZeroGroupedInt8x32:
+ v.Op = OpAMD64VPSHUFB256
+ return true
+ case OpPermuteOrZeroGroupedInt8x64:
+ v.Op = OpAMD64VPSHUFB512
+ return true
+ case OpPermuteOrZeroGroupedUint8x32:
+ v.Op = OpAMD64VPSHUFB256
+ return true
+ case OpPermuteOrZeroGroupedUint8x64:
+ v.Op = OpAMD64VPSHUFB512
+ return true
+ case OpPermuteOrZeroInt8x16:
+ v.Op = OpAMD64VPSHUFB128
+ return true
+ case OpPermuteOrZeroUint8x16:
+ v.Op = OpAMD64VPSHUFB128
+ return true
case OpPermuteUint16x16:
v.Op = OpAMD64VPERMW256
return true
v.Op = OpAMD64VPERMQ512
return true
case OpPermuteUint8x16:
- v.Op = OpAMD64VPSHUFB128
+ v.Op = OpAMD64VPERMB128
return true
case OpPermuteUint8x32:
v.Op = OpAMD64VPERMB256
case OpconcatSelectedConstantUint64x2:
v.Op = OpAMD64VSHUFPD128
return true
+ case OppermuteScalarsGroupedInt32x16:
+ v.Op = OpAMD64VPSHUFD512
+ return true
+ case OppermuteScalarsGroupedInt32x8:
+ v.Op = OpAMD64VPSHUFD256
+ return true
+ case OppermuteScalarsGroupedUint32x16:
+ v.Op = OpAMD64VPSHUFD512
+ return true
+ case OppermuteScalarsGroupedUint32x8:
+ v.Op = OpAMD64VPSHUFD256
+ return true
+ case OppermuteScalarsHiGroupedInt16x16:
+ v.Op = OpAMD64VPSHUFHW256
+ return true
+ case OppermuteScalarsHiGroupedInt16x32:
+ v.Op = OpAMD64VPSHUFHW512
+ return true
+ case OppermuteScalarsHiGroupedUint16x16:
+ v.Op = OpAMD64VPSHUFHW256
+ return true
+ case OppermuteScalarsHiGroupedUint16x32:
+ v.Op = OpAMD64VPSHUFHW512
+ return true
+ case OppermuteScalarsHiInt16x8:
+ v.Op = OpAMD64VPSHUFHW128
+ return true
+ case OppermuteScalarsHiUint16x8:
+ v.Op = OpAMD64VPSHUFHW128
+ return true
+ case OppermuteScalarsInt32x4:
+ v.Op = OpAMD64VPSHUFD128
+ return true
+ case OppermuteScalarsLoGroupedInt16x16:
+ v.Op = OpAMD64VPSHUFLW256
+ return true
+ case OppermuteScalarsLoGroupedInt16x32:
+ v.Op = OpAMD64VPSHUFLW512
+ return true
+ case OppermuteScalarsLoGroupedUint16x16:
+ v.Op = OpAMD64VPSHUFLW256
+ return true
+ case OppermuteScalarsLoGroupedUint16x32:
+ v.Op = OpAMD64VPSHUFLW512
+ return true
+ case OppermuteScalarsLoInt16x8:
+ v.Op = OpAMD64VPSHUFLW128
+ return true
+ case OppermuteScalarsLoUint16x8:
+ v.Op = OpAMD64VPSHUFLW128
+ return true
+ case OppermuteScalarsUint32x4:
+ v.Op = OpAMD64VPSHUFD128
+ return true
case OpternInt32x16:
v.Op = OpAMD64VPTERNLOGD512
return true
v.AddArg2(x, mask)
return true
}
+ // match: (VMOVDQU16Masked128 (VPERMI2W128 x y z) mask)
+ // result: (VPERMI2WMasked128 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2W128 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2WMasked128)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
// match: (VMOVDQU16Masked128 (VPMOVWB128_128 x) mask)
// result: (VPMOVWBMasked128_128 x mask)
for {
v.AddArg2(x, mask)
return true
}
- // match: (VMOVDQU16Masked128 (VPERMI2W128 x y z) mask)
- // result: (VPERMI2WMasked128 x y z mask)
- for {
- if v_0.Op != OpAMD64VPERMI2W128 {
- break
- }
- z := v_0.Args[2]
- x := v_0.Args[0]
- y := v_0.Args[1]
- mask := v_1
- v.reset(OpAMD64VPERMI2WMasked128)
- v.AddArg4(x, y, z, mask)
- return true
- }
- // match: (VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask)
- // result: (VPSHUFHWMasked128 [a] x mask)
- for {
- if v_0.Op != OpAMD64VPSHUFHW128 {
- break
- }
- a := auxIntToUint8(v_0.AuxInt)
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPSHUFHWMasked128)
- v.AuxInt = uint8ToAuxInt(a)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU16Masked128 (VPERMW128 x y) mask)
// result: (VPERMWMasked128 x y mask)
for {
v.AddArg3(x, y, mask)
return true
}
+ // match: (VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask)
+ // result: (VPSHUFHWMasked128 [a] x mask)
+ for {
+ if v_0.Op != OpAMD64VPSHUFHW128 {
+ break
+ }
+ a := auxIntToUint8(v_0.AuxInt)
+ x := v_0.Args[0]
+ mask := v_1
+ v.reset(OpAMD64VPSHUFHWMasked128)
+ v.AuxInt = uint8ToAuxInt(a)
+ v.AddArg2(x, mask)
+ return true
+ }
+ // match: (VMOVDQU16Masked128 (VPSHUFLW128 [a] x) mask)
+ // result: (VPSHUFLWMasked128 [a] x mask)
+ for {
+ if v_0.Op != OpAMD64VPSHUFLW128 {
+ break
+ }
+ a := auxIntToUint8(v_0.AuxInt)
+ x := v_0.Args[0]
+ mask := v_1
+ v.reset(OpAMD64VPSHUFLWMasked128)
+ v.AuxInt = uint8ToAuxInt(a)
+ v.AddArg2(x, mask)
+ return true
+ }
// match: (VMOVDQU16Masked128 (VPSLLW128const [a] x) mask)
// result: (VPSLLWMasked128const [a] x mask)
for {
v.AddArg2(x, mask)
return true
}
+ // match: (VMOVDQU16Masked256 (VPERMI2W256 x y z) mask)
+ // result: (VPERMI2WMasked256 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2W256 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2WMasked256)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
// match: (VMOVDQU16Masked256 (VPMOVWB128_256 x) mask)
// result: (VPMOVWBMasked128_256 x mask)
for {
v.AddArg2(x, mask)
return true
}
- // match: (VMOVDQU16Masked256 (VPERMI2W256 x y z) mask)
- // result: (VPERMI2WMasked256 x y z mask)
- for {
- if v_0.Op != OpAMD64VPERMI2W256 {
- break
- }
- z := v_0.Args[2]
- x := v_0.Args[0]
- y := v_0.Args[1]
- mask := v_1
- v.reset(OpAMD64VPERMI2WMasked256)
- v.AddArg4(x, y, z, mask)
- return true
- }
- // match: (VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask)
- // result: (VPSHUFHWMasked256 [a] x mask)
- for {
- if v_0.Op != OpAMD64VPSHUFHW256 {
- break
- }
- a := auxIntToUint8(v_0.AuxInt)
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPSHUFHWMasked256)
- v.AuxInt = uint8ToAuxInt(a)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU16Masked256 (VPERMW256 x y) mask)
// result: (VPERMWMasked256 x y mask)
for {
v.AddArg3(x, y, mask)
return true
}
+ // match: (VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask)
+ // result: (VPSHUFHWMasked256 [a] x mask)
+ for {
+ if v_0.Op != OpAMD64VPSHUFHW256 {
+ break
+ }
+ a := auxIntToUint8(v_0.AuxInt)
+ x := v_0.Args[0]
+ mask := v_1
+ v.reset(OpAMD64VPSHUFHWMasked256)
+ v.AuxInt = uint8ToAuxInt(a)
+ v.AddArg2(x, mask)
+ return true
+ }
+ // match: (VMOVDQU16Masked256 (VPSHUFLW256 [a] x) mask)
+ // result: (VPSHUFLWMasked256 [a] x mask)
+ for {
+ if v_0.Op != OpAMD64VPSHUFLW256 {
+ break
+ }
+ a := auxIntToUint8(v_0.AuxInt)
+ x := v_0.Args[0]
+ mask := v_1
+ v.reset(OpAMD64VPSHUFLWMasked256)
+ v.AuxInt = uint8ToAuxInt(a)
+ v.AddArg2(x, mask)
+ return true
+ }
// match: (VMOVDQU16Masked256 (VPSLLW256const [a] x) mask)
// result: (VPSLLWMasked256const [a] x mask)
for {
v.AddArg2(x, mask)
return true
}
+ // match: (VMOVDQU16Masked512 (VPERMI2W512 x y z) mask)
+ // result: (VPERMI2WMasked512 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2W512 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2WMasked512)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
// match: (VMOVDQU16Masked512 (VPMOVSXWD512 x) mask)
// result: (VPMOVSXWDMasked512 x mask)
for {
v.AddArg2(x, mask)
return true
}
- // match: (VMOVDQU16Masked512 (VPERMI2W512 x y z) mask)
- // result: (VPERMI2WMasked512 x y z mask)
- for {
- if v_0.Op != OpAMD64VPERMI2W512 {
- break
- }
- z := v_0.Args[2]
- x := v_0.Args[0]
- y := v_0.Args[1]
- mask := v_1
- v.reset(OpAMD64VPERMI2WMasked512)
- v.AddArg4(x, y, z, mask)
- return true
- }
- // match: (VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask)
- // result: (VPSHUFHWMasked512 [a] x mask)
- for {
- if v_0.Op != OpAMD64VPSHUFHW512 {
- break
- }
- a := auxIntToUint8(v_0.AuxInt)
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPSHUFHWMasked512)
- v.AuxInt = uint8ToAuxInt(a)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU16Masked512 (VPERMW512 x y) mask)
// result: (VPERMWMasked512 x y mask)
for {
v.AddArg3(x, y, mask)
return true
}
+ // match: (VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask)
+ // result: (VPSHUFHWMasked512 [a] x mask)
+ for {
+ if v_0.Op != OpAMD64VPSHUFHW512 {
+ break
+ }
+ a := auxIntToUint8(v_0.AuxInt)
+ x := v_0.Args[0]
+ mask := v_1
+ v.reset(OpAMD64VPSHUFHWMasked512)
+ v.AuxInt = uint8ToAuxInt(a)
+ v.AddArg2(x, mask)
+ return true
+ }
+ // match: (VMOVDQU16Masked512 (VPSHUFLW512 [a] x) mask)
+ // result: (VPSHUFLWMasked512 [a] x mask)
+ for {
+ if v_0.Op != OpAMD64VPSHUFLW512 {
+ break
+ }
+ a := auxIntToUint8(v_0.AuxInt)
+ x := v_0.Args[0]
+ mask := v_1
+ v.reset(OpAMD64VPSHUFLWMasked512)
+ v.AuxInt = uint8ToAuxInt(a)
+ v.AddArg2(x, mask)
+ return true
+ }
// match: (VMOVDQU16Masked512 (VPSLLW512const [a] x) mask)
// result: (VPSLLWMasked512const [a] x mask)
for {
v.AddArg2(x, mask)
return true
}
+ // match: (VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask)
+ // result: (VPERMI2PSMasked128 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2PS128 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2PSMasked128)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
+ // match: (VMOVDQU32Masked128 (VPERMI2D128 x y z) mask)
+ // result: (VPERMI2DMasked128 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2D128 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2DMasked128)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
// match: (VMOVDQU32Masked128 (VPMOVDB128_128 x) mask)
// result: (VPMOVDBMasked128_128 x mask)
for {
v.AddArg2(x, mask)
return true
}
- // match: (VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask)
- // result: (VPERMI2PSMasked128 x y z mask)
- for {
- if v_0.Op != OpAMD64VPERMI2PS128 {
- break
- }
- z := v_0.Args[2]
- x := v_0.Args[0]
- y := v_0.Args[1]
- mask := v_1
- v.reset(OpAMD64VPERMI2PSMasked128)
- v.AddArg4(x, y, z, mask)
- return true
- }
- // match: (VMOVDQU32Masked128 (VPERMI2D128 x y z) mask)
- // result: (VPERMI2DMasked128 x y z mask)
- for {
- if v_0.Op != OpAMD64VPERMI2D128 {
- break
- }
- z := v_0.Args[2]
- x := v_0.Args[0]
- y := v_0.Args[1]
- mask := v_1
- v.reset(OpAMD64VPERMI2DMasked128)
- v.AddArg4(x, y, z, mask)
- return true
- }
- // match: (VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask)
- // result: (VPSHUFDMasked128 [a] x mask)
- for {
- if v_0.Op != OpAMD64VPSHUFD128 {
- break
- }
- a := auxIntToUint8(v_0.AuxInt)
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPSHUFDMasked128)
- v.AuxInt = uint8ToAuxInt(a)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU32Masked128 (VPROLD128 [a] x) mask)
// result: (VPROLDMasked128 [a] x mask)
for {
v.AddArg3(x, y, mask)
return true
}
+ // match: (VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask)
+ // result: (VPSHUFDMasked128 [a] x mask)
+ for {
+ if v_0.Op != OpAMD64VPSHUFD128 {
+ break
+ }
+ a := auxIntToUint8(v_0.AuxInt)
+ x := v_0.Args[0]
+ mask := v_1
+ v.reset(OpAMD64VPSHUFDMasked128)
+ v.AuxInt = uint8ToAuxInt(a)
+ v.AddArg2(x, mask)
+ return true
+ }
// match: (VMOVDQU32Masked128 (VPSLLD128const [a] x) mask)
// result: (VPSLLDMasked128const [a] x mask)
for {
v.AddArg2(x, mask)
return true
}
+ // match: (VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask)
+ // result: (VPERMI2PSMasked256 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2PS256 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2PSMasked256)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
+ // match: (VMOVDQU32Masked256 (VPERMI2D256 x y z) mask)
+ // result: (VPERMI2DMasked256 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2D256 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2DMasked256)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
// match: (VMOVDQU32Masked256 (VPMOVDB128_256 x) mask)
// result: (VPMOVDBMasked128_256 x mask)
for {
v.AddArg2(x, mask)
return true
}
- // match: (VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask)
- // result: (VPERMI2PSMasked256 x y z mask)
- for {
- if v_0.Op != OpAMD64VPERMI2PS256 {
- break
- }
- z := v_0.Args[2]
- x := v_0.Args[0]
- y := v_0.Args[1]
- mask := v_1
- v.reset(OpAMD64VPERMI2PSMasked256)
- v.AddArg4(x, y, z, mask)
- return true
- }
- // match: (VMOVDQU32Masked256 (VPERMI2D256 x y z) mask)
- // result: (VPERMI2DMasked256 x y z mask)
- for {
- if v_0.Op != OpAMD64VPERMI2D256 {
- break
- }
- z := v_0.Args[2]
- x := v_0.Args[0]
- y := v_0.Args[1]
- mask := v_1
- v.reset(OpAMD64VPERMI2DMasked256)
- v.AddArg4(x, y, z, mask)
- return true
- }
- // match: (VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask)
- // result: (VPSHUFDMasked256 [a] x mask)
- for {
- if v_0.Op != OpAMD64VPSHUFD256 {
- break
- }
- a := auxIntToUint8(v_0.AuxInt)
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPSHUFDMasked256)
- v.AuxInt = uint8ToAuxInt(a)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU32Masked256 (VPERMPS256 x y) mask)
// result: (VPERMPSMasked256 x y mask)
for {
v.AddArg3(x, y, mask)
return true
}
+ // match: (VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask)
+ // result: (VPSHUFDMasked256 [a] x mask)
+ for {
+ if v_0.Op != OpAMD64VPSHUFD256 {
+ break
+ }
+ a := auxIntToUint8(v_0.AuxInt)
+ x := v_0.Args[0]
+ mask := v_1
+ v.reset(OpAMD64VPSHUFDMasked256)
+ v.AuxInt = uint8ToAuxInt(a)
+ v.AddArg2(x, mask)
+ return true
+ }
// match: (VMOVDQU32Masked256 (VPSLLD256const [a] x) mask)
// result: (VPSLLDMasked256const [a] x mask)
for {
v.AddArg2(x, mask)
return true
}
+ // match: (VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask)
+ // result: (VPERMI2PSMasked512 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2PS512 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2PSMasked512)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
+ // match: (VMOVDQU32Masked512 (VPERMI2D512 x y z) mask)
+ // result: (VPERMI2DMasked512 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2D512 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2DMasked512)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
// match: (VMOVDQU32Masked512 (VPMOVDB128_512 x) mask)
// result: (VPMOVDBMasked128_512 x mask)
for {
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask)
- // result: (VPERMI2PSMasked512 x y z mask)
- for {
- if v_0.Op != OpAMD64VPERMI2PS512 {
- break
- }
- z := v_0.Args[2]
- x := v_0.Args[0]
- y := v_0.Args[1]
- mask := v_1
- v.reset(OpAMD64VPERMI2PSMasked512)
- v.AddArg4(x, y, z, mask)
- return true
- }
- // match: (VMOVDQU32Masked512 (VPERMI2D512 x y z) mask)
- // result: (VPERMI2DMasked512 x y z mask)
- for {
- if v_0.Op != OpAMD64VPERMI2D512 {
- break
- }
- z := v_0.Args[2]
- x := v_0.Args[0]
- y := v_0.Args[1]
- mask := v_1
- v.reset(OpAMD64VPERMI2DMasked512)
- v.AddArg4(x, y, z, mask)
- return true
- }
- // match: (VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask)
- // result: (VPSHUFDMasked512 [a] x mask)
- for {
- if v_0.Op != OpAMD64VPSHUFD512 {
- break
- }
- a := auxIntToUint8(v_0.AuxInt)
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPSHUFDMasked512)
- v.AuxInt = uint8ToAuxInt(a)
- v.AddArg2(x, mask)
- return true
- }
// match: (VMOVDQU32Masked512 (VPERMPS512 x y) mask)
// result: (VPERMPSMasked512 x y mask)
for {
v.AddArg3(x, y, mask)
return true
}
+ // match: (VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask)
+ // result: (VPSHUFDMasked512 [a] x mask)
+ for {
+ if v_0.Op != OpAMD64VPSHUFD512 {
+ break
+ }
+ a := auxIntToUint8(v_0.AuxInt)
+ x := v_0.Args[0]
+ mask := v_1
+ v.reset(OpAMD64VPSHUFDMasked512)
+ v.AuxInt = uint8ToAuxInt(a)
+ v.AddArg2(x, mask)
+ return true
+ }
// match: (VMOVDQU32Masked512 (VPSLLD512const [a] x) mask)
// result: (VPSLLDMasked512const [a] x mask)
for {
v.AddArg2(x, mask)
return true
}
+ // match: (VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask)
+ // result: (VPERMI2PDMasked128 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2PD128 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2PDMasked128)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
+ // match: (VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask)
+ // result: (VPERMI2QMasked128 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2Q128 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2QMasked128)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
// match: (VMOVDQU64Masked128 (VPMOVQB128_128 x) mask)
// result: (VPMOVQBMasked128_128 x mask)
for {
v.AddArg2(x, mask)
return true
}
- // match: (VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask)
- // result: (VPERMI2PDMasked128 x y z mask)
- for {
- if v_0.Op != OpAMD64VPERMI2PD128 {
- break
- }
- z := v_0.Args[2]
- x := v_0.Args[0]
- y := v_0.Args[1]
- mask := v_1
- v.reset(OpAMD64VPERMI2PDMasked128)
- v.AddArg4(x, y, z, mask)
- return true
- }
- // match: (VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask)
- // result: (VPERMI2QMasked128 x y z mask)
- for {
- if v_0.Op != OpAMD64VPERMI2Q128 {
- break
- }
- z := v_0.Args[2]
- x := v_0.Args[0]
- y := v_0.Args[1]
- mask := v_1
- v.reset(OpAMD64VPERMI2QMasked128)
- v.AddArg4(x, y, z, mask)
- return true
- }
// match: (VMOVDQU64Masked128 (VRCP14PD128 x) mask)
// result: (VRCP14PDMasked128 x mask)
for {
v.AddArg2(x, mask)
return true
}
+ // match: (VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask)
+ // result: (VPERMI2PDMasked256 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2PD256 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2PDMasked256)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
+ // match: (VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask)
+ // result: (VPERMI2QMasked256 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2Q256 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2QMasked256)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
// match: (VMOVDQU64Masked256 (VPMOVQB128_256 x) mask)
// result: (VPMOVQBMasked128_256 x mask)
for {
v.AddArg2(x, mask)
return true
}
- // match: (VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask)
- // result: (VPERMI2PDMasked256 x y z mask)
- for {
- if v_0.Op != OpAMD64VPERMI2PD256 {
- break
- }
- z := v_0.Args[2]
- x := v_0.Args[0]
- y := v_0.Args[1]
- mask := v_1
- v.reset(OpAMD64VPERMI2PDMasked256)
- v.AddArg4(x, y, z, mask)
- return true
- }
- // match: (VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask)
- // result: (VPERMI2QMasked256 x y z mask)
- for {
- if v_0.Op != OpAMD64VPERMI2Q256 {
- break
- }
- z := v_0.Args[2]
- x := v_0.Args[0]
- y := v_0.Args[1]
- mask := v_1
- v.reset(OpAMD64VPERMI2QMasked256)
- v.AddArg4(x, y, z, mask)
- return true
- }
// match: (VMOVDQU64Masked256 (VPERMPD256 x y) mask)
// result: (VPERMPDMasked256 x y mask)
for {
v.AddArg2(x, mask)
return true
}
+ // match: (VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask)
+ // result: (VPERMI2PDMasked512 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2PD512 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2PDMasked512)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
+ // match: (VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask)
+ // result: (VPERMI2QMasked512 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2Q512 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2QMasked512)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
// match: (VMOVDQU64Masked512 (VPMOVQB128_512 x) mask)
// result: (VPMOVQBMasked128_512 x mask)
for {
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask)
- // result: (VPERMI2PDMasked512 x y z mask)
- for {
- if v_0.Op != OpAMD64VPERMI2PD512 {
- break
- }
- z := v_0.Args[2]
- x := v_0.Args[0]
- y := v_0.Args[1]
- mask := v_1
- v.reset(OpAMD64VPERMI2PDMasked512)
- v.AddArg4(x, y, z, mask)
- return true
- }
- // match: (VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask)
- // result: (VPERMI2QMasked512 x y z mask)
- for {
- if v_0.Op != OpAMD64VPERMI2Q512 {
- break
- }
- z := v_0.Args[2]
- x := v_0.Args[0]
- y := v_0.Args[1]
- mask := v_1
- v.reset(OpAMD64VPERMI2QMasked512)
- v.AddArg4(x, y, z, mask)
- return true
- }
// match: (VMOVDQU64Masked512 (VPERMPD512 x y) mask)
// result: (VPERMPDMasked512 x y mask)
for {
v.AddArg2(x, mask)
return true
}
+ // match: (VMOVDQU8Masked128 (VPERMI2B128 x y z) mask)
+ // result: (VPERMI2BMasked128 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2B128 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2BMasked128)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
// match: (VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask)
// result: (VPALIGNRMasked128 [a] x y mask)
for {
v.AddArg2(x, mask)
return true
}
- // match: (VMOVDQU8Masked128 (VPERMI2B128 x y z) mask)
- // result: (VPERMI2BMasked128 x y z mask)
+ // match: (VMOVDQU8Masked128 (VPERMB128 x y) mask)
+ // result: (VPERMBMasked128 x y mask)
for {
- if v_0.Op != OpAMD64VPERMI2B128 {
+ if v_0.Op != OpAMD64VPERMB128 {
break
}
- z := v_0.Args[2]
- x := v_0.Args[0]
y := v_0.Args[1]
+ x := v_0.Args[0]
mask := v_1
- v.reset(OpAMD64VPERMI2BMasked128)
- v.AddArg4(x, y, z, mask)
+ v.reset(OpAMD64VPERMBMasked128)
+ v.AddArg3(x, y, mask)
return true
}
// match: (VMOVDQU8Masked128 (VPSHUFB128 x y) mask)
v.AddArg2(x, mask)
return true
}
+ // match: (VMOVDQU8Masked256 (VPERMI2B256 x y z) mask)
+ // result: (VPERMI2BMasked256 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2B256 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2BMasked256)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
// match: (VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask)
// result: (VPALIGNRMasked256 [a] x y mask)
for {
v.AddArg2(x, mask)
return true
}
- // match: (VMOVDQU8Masked256 (VPERMI2B256 x y z) mask)
- // result: (VPERMI2BMasked256 x y z mask)
+ // match: (VMOVDQU8Masked256 (VPERMB256 x y) mask)
+ // result: (VPERMBMasked256 x y mask)
for {
- if v_0.Op != OpAMD64VPERMI2B256 {
+ if v_0.Op != OpAMD64VPERMB256 {
break
}
- z := v_0.Args[2]
- x := v_0.Args[0]
y := v_0.Args[1]
+ x := v_0.Args[0]
mask := v_1
- v.reset(OpAMD64VPERMI2BMasked256)
- v.AddArg4(x, y, z, mask)
+ v.reset(OpAMD64VPERMBMasked256)
+ v.AddArg3(x, y, mask)
return true
}
// match: (VMOVDQU8Masked256 (VPSHUFB256 x y) mask)
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU8Masked256 (VPERMB256 x y) mask)
- // result: (VPERMBMasked256 x y mask)
- for {
- if v_0.Op != OpAMD64VPERMB256 {
- break
- }
- y := v_0.Args[1]
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPERMBMasked256)
- v.AddArg3(x, y, mask)
- return true
- }
// match: (VMOVDQU8Masked256 (VPSUBB256 x y) mask)
// result: (VPSUBBMasked256 x y mask)
for {
v.AddArg2(x, mask)
return true
}
+ // match: (VMOVDQU8Masked512 (VPERMI2B512 x y z) mask)
+ // result: (VPERMI2BMasked512 x y z mask)
+ for {
+ if v_0.Op != OpAMD64VPERMI2B512 {
+ break
+ }
+ z := v_0.Args[2]
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ mask := v_1
+ v.reset(OpAMD64VPERMI2BMasked512)
+ v.AddArg4(x, y, z, mask)
+ return true
+ }
// match: (VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask)
// result: (VPALIGNRMasked512 [a] x y mask)
for {
v.AddArg2(x, mask)
return true
}
- // match: (VMOVDQU8Masked512 (VPERMI2B512 x y z) mask)
- // result: (VPERMI2BMasked512 x y z mask)
+ // match: (VMOVDQU8Masked512 (VPERMB512 x y) mask)
+ // result: (VPERMBMasked512 x y mask)
for {
- if v_0.Op != OpAMD64VPERMI2B512 {
+ if v_0.Op != OpAMD64VPERMB512 {
break
}
- z := v_0.Args[2]
- x := v_0.Args[0]
y := v_0.Args[1]
+ x := v_0.Args[0]
mask := v_1
- v.reset(OpAMD64VPERMI2BMasked512)
- v.AddArg4(x, y, z, mask)
+ v.reset(OpAMD64VPERMBMasked512)
+ v.AddArg3(x, y, mask)
return true
}
// match: (VMOVDQU8Masked512 (VPSHUFB512 x y) mask)
v.AddArg3(x, y, mask)
return true
}
- // match: (VMOVDQU8Masked512 (VPERMB512 x y) mask)
- // result: (VPERMBMasked512 x y mask)
- for {
- if v_0.Op != OpAMD64VPERMB512 {
- break
- }
- y := v_0.Args[1]
- x := v_0.Args[0]
- mask := v_1
- v.reset(OpAMD64VPERMBMasked512)
- v.AddArg3(x, y, mask)
- return true
- }
// match: (VMOVDQU8Masked512 (VPSUBB512 x y) mask)
// result: (VPSUBBMasked512 x y mask)
for {
v.AddArg3(dst, x, mask)
return true
}
+ // match: (VPBLENDMWMasked512 dst (VPSHUFLW512 [a] x) mask)
+ // result: (VPSHUFLWMasked512Merging dst [a] x mask)
+ for {
+ dst := v_0
+ if v_1.Op != OpAMD64VPSHUFLW512 {
+ break
+ }
+ a := auxIntToUint8(v_1.AuxInt)
+ x := v_1.Args[0]
+ mask := v_2
+ v.reset(OpAMD64VPSHUFLWMasked512Merging)
+ v.AuxInt = uint8ToAuxInt(a)
+ v.AddArg3(dst, x, mask)
+ return true
+ }
// match: (VPBLENDMWMasked512 dst (VPSLLVW512 x y) mask)
// result: (VPSLLVWMasked512Merging dst x y mask)
for {
v.AddArg3(dst, x, v0)
return true
}
+ // match: (VPBLENDVB128 dst (VPSHUFLW128 [a] x) mask)
+ // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
+ // result: (VPSHUFLWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask))
+ for {
+ dst := v_0
+ if v_1.Op != OpAMD64VPSHUFLW128 {
+ break
+ }
+ a := auxIntToUint8(v_1.AuxInt)
+ x := v_1.Args[0]
+ mask := v_2
+ if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
+ break
+ }
+ v.reset(OpAMD64VPSHUFLWMasked128Merging)
+ v.AuxInt = uint8ToAuxInt(a)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg3(dst, x, v0)
+ return true
+ }
// match: (VPBLENDVB128 dst (VPSLLD128const [a] x) mask)
// cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
// result: (VPSLLDMasked128constMerging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
v.AddArg3(dst, x, v0)
return true
}
+ // match: (VPBLENDVB256 dst (VPSHUFLW256 [a] x) mask)
+ // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
+ // result: (VPSHUFLWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask))
+ for {
+ dst := v_0
+ if v_1.Op != OpAMD64VPSHUFLW256 {
+ break
+ }
+ a := auxIntToUint8(v_1.AuxInt)
+ x := v_1.Args[0]
+ mask := v_2
+ if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
+ break
+ }
+ v.reset(OpAMD64VPSHUFLWMasked256Merging)
+ v.AuxInt = uint8ToAuxInt(a)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg3(dst, x, v0)
+ return true
+ }
// match: (VPBLENDVB256 dst (VPSLLD256const [a] x) mask)
// cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
// result: (VPSLLDMasked256constMerging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
addF(simdPackage, "Uint64x2.Compress", opLen2(ssa.OpCompressUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Compress", opLen2(ssa.OpCompressUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Compress", opLen2(ssa.OpCompressUint64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x32, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x32, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int8x64.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x64, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x64.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x64, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int16x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint16x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x8, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int16x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint16x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x32, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint16x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x32, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Float32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Float64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x2, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Float64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Or", opLen2(ssa.OpOrUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Or", opLen2(ssa.OpOrUint64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int8x16.Permute", opLen2(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint8x16.Permute", opLen2(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x32.Permute", opLen2_21(ssa.OpPermuteUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.Permute", opLen2_21(ssa.OpPermuteInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float64x8.Permute", opLen2_21(ssa.OpPermuteFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.Permute", opLen2_21(ssa.OpPermuteInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.Permute", opLen2_21(ssa.OpPermuteUint64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int8x16.Permute2", opLen3_231(ssa.OpPermute2Int8x16, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint8x16.Permute2", opLen3_231(ssa.OpPermute2Uint8x16, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int8x32.Permute2", opLen3_231(ssa.OpPermute2Int8x32, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint8x32.Permute2", opLen3_231(ssa.OpPermute2Uint8x32, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int8x64.Permute2", opLen3_231(ssa.OpPermute2Int8x64, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint8x64.Permute2", opLen3_231(ssa.OpPermute2Uint8x64, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int16x8.Permute2", opLen3_231(ssa.OpPermute2Int16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint16x8.Permute2", opLen3_231(ssa.OpPermute2Uint16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x16.Permute2", opLen3_231(ssa.OpPermute2Int16x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint16x16.Permute2", opLen3_231(ssa.OpPermute2Uint16x16, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int16x32.Permute2", opLen3_231(ssa.OpPermute2Int16x32, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint16x32.Permute2", opLen3_231(ssa.OpPermute2Uint16x32, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Float32x4.Permute2", opLen3_231(ssa.OpPermute2Float32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int32x4.Permute2", opLen3_231(ssa.OpPermute2Int32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint32x4.Permute2", opLen3_231(ssa.OpPermute2Uint32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float32x8.Permute2", opLen3_231(ssa.OpPermute2Float32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int32x8.Permute2", opLen3_231(ssa.OpPermute2Int32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint32x8.Permute2", opLen3_231(ssa.OpPermute2Uint32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Float32x16.Permute2", opLen3_231(ssa.OpPermute2Float32x16, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int32x16.Permute2", opLen3_231(ssa.OpPermute2Int32x16, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint32x16.Permute2", opLen3_231(ssa.OpPermute2Uint32x16, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Float64x2.Permute2", opLen3_231(ssa.OpPermute2Float64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int64x2.Permute2", opLen3_231(ssa.OpPermute2Int64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint64x2.Permute2", opLen3_231(ssa.OpPermute2Uint64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float64x4.Permute2", opLen3_231(ssa.OpPermute2Float64x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int64x4.Permute2", opLen3_231(ssa.OpPermute2Int64x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint64x4.Permute2", opLen3_231(ssa.OpPermute2Uint64x4, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Float64x8.Permute2", opLen3_231(ssa.OpPermute2Float64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int64x8.Permute2", opLen3_231(ssa.OpPermute2Int64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint64x8.Permute2", opLen3_231(ssa.OpPermute2Uint64x8, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Int32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantInt32x4, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantUint32x4, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x8, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x16, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Uint32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x8, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Int16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt16x8, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt32x4, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint16x8, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint32x4, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Uint16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Int16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt16x8, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt32x4, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint16x8, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Uint32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint32x4, types.TypeVec128, 0), sys.AMD64)
- addF(simdPackage, "Int16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Int16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Uint16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
- addF(simdPackage, "Uint16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
- addF(simdPackage, "Int8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x32, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x64, types.TypeVec512), sys.AMD64)
- addF(simdPackage, "Uint8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x32, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x64, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int8x16.PermuteOrZero", opLen2(ssa.OpPermuteOrZeroInt8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Uint8x16.PermuteOrZero", opLen2(ssa.OpPermuteOrZeroUint8x16, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int8x32.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedInt8x32, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int8x64.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedInt8x64, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x32.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedUint8x32, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint8x64.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Reciprocal", opLen1(ssa.OpReciprocalFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Reciprocal", opLen1(ssa.OpReciprocalFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Reciprocal", opLen1(ssa.OpReciprocalFloat32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x16.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint64x4.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint64x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Int32x4.permuteScalars", opLen1Imm8(ssa.OppermuteScalarsInt32x4, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint32x4.permuteScalars", opLen1Imm8(ssa.OppermuteScalarsUint32x4, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Int32x8.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedInt32x8, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int32x16.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedInt32x16, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Uint32x8.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedUint32x8, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint32x16.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Int16x8.permuteScalarsHi", opLen1Imm8(ssa.OppermuteScalarsHiInt16x8, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x8.permuteScalarsHi", opLen1Imm8(ssa.OppermuteScalarsHiUint16x8, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Int16x16.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int16x32.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x16.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x32.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Int16x8.permuteScalarsLo", opLen1Imm8(ssa.OppermuteScalarsLoInt16x8, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x8.permuteScalarsLo", opLen1Imm8(ssa.OppermuteScalarsLoUint16x8, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Int16x16.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Int16x32.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x16.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint16x32.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int32x4.tern", opLen3Imm8(ssa.OpternInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x8.tern", opLen3Imm8(ssa.OpternInt32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int32x16.tern", opLen3Imm8(ssa.OpternInt32x16, types.TypeVec512, 0), sys.AMD64)
if op.NoGenericOps != nil && *op.NoGenericOps == "true" {
continue
}
+ if op.SkipMaskedMethod() {
+ continue
+ }
_, _, _, immType, gOp := op.shape()
gOpData := genericOpsData{gOp.GenericName(), len(gOp.In), op.Commutative}
if immType == VarImm || immType == ConstVarImm {
if op.NoTypes != nil && *op.NoTypes == "true" {
continue
}
+ if op.SkipMaskedMethod() {
+ continue
+ }
if s, op, err := classifyOp(op); err == nil {
if err := t.ExecuteTemplate(buffer, s, op); err != nil {
panic(fmt.Errorf("failed to execute template %s for op %s: %w", s, op.Go, err))
if op.NoTypes != nil && *op.NoTypes == "true" {
continue
}
+ if op.SkipMaskedMethod() {
+ continue
+ }
idxVecAsScalar, err := checkVecAsScalar(op)
if err != nil {
panic(err)
data.ArgsOut = "..."
}
data.tplName = tplName
- if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" {
+ if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" ||
+ opr.SkipMaskedMethod() {
optData = append(optData, data)
continue
}
NoGenericOps *string
// If non-nil, this string will be attached to the machine ssa op name. E.g. "const"
SSAVariant *string
+	// If true, do not emit method declarations, generic ops, or intrinsics for masked variants;
+	// DO emit the architecture-specific opcodes and optimizations.
+ HideMaskMethods *bool
+}
+
+func (o *Operation) IsMasked() bool {
+ if len(o.InVariant) == 0 {
+ return false
+ }
+ if len(o.InVariant) == 1 && o.InVariant[0].Class == "mask" {
+ return true
+ }
+ panic(fmt.Errorf("unknown inVariant"))
+}
+
+func (o *Operation) SkipMaskedMethod() bool {
+ if o.HideMaskMethods == nil {
+ return false
+ }
+ if *o.HideMaskMethods && o.IsMasked() {
+ return true
+ }
+ return false
}
func (o *Operation) DecodeUnified(v *unify.Value) error {
return err
}
- isMasked := false
- if len(o.InVariant) == 0 {
- // No variant
- } else if len(o.InVariant) == 1 && o.InVariant[0].Class == "mask" {
- isMasked = true
- } else {
- return fmt.Errorf("unknown inVariant")
- }
+ isMasked := o.IsMasked()
// Compute full Go method name.
o.Go = o.rawOperation.Go
o.Documentation = regexp.MustCompile(`\bNAME\b`).ReplaceAllString(o.Documentation, o.Go)
if isMasked {
o.Documentation += "\n//\n// This operation is applied selectively under a write mask."
+ // Suppress the generic op and method declaration for exported methods if a mask is present.
if unicode.IsUpper([]rune(o.Go)[0]) {
trueVal := "true"
o.NoGenericOps = &trueVal
constImm: 1
documentation: !string |-
// NAME returns the upper half of x.
+- go: PermuteOrZero
+ commutative: false
+ documentation: !string |-
+ // NAME performs a full permutation of vector x using indices:
+ // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
- go: Permute
commutative: false
documentation: !string |-
// NAME performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
- // Only the needed bits to represent x's index are used in indices' elements.
-- go: Permute2 # Permute2 is only available on or after AVX512
+- go: ConcatPermute # ConcatPermute is only available on or after AVX512
commutative: false
documentation: !string |-
// NAME performs a full permutation of vector x, y using indices:
// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
- // where xy is x appending y.
+ // where xy is the concatenation of x (lower half) and y (upper half).
// Only the needed bits to represent xy's index are used in indices' elements.
- go: Compress
commutative: false
documentation: !string |-
// NAME copies element zero of its (128-bit) input to all elements of
// the 512-bit output vector.
+- go: PermuteOrZeroGrouped
+ commutative: false
+ documentation: !string |- # Detailed documentation will rely on the specific ops.
+ // NAME performs a grouped permutation of vector x using indices:
- go: PermuteGrouped
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using indices:
-- go: PermuteConstant
+- go: permuteScalars
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a permutation of vector x using constant indices:
-- go: PermuteConstantGrouped
+- go: permuteScalarsGrouped
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using constant indices:
-- go: PermuteConstantLo
+- go: permuteScalarsLo
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a permutation of vector x using constant indices:
-- go: PermuteConstantLoGrouped
+- go: permuteScalarsLoGrouped
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using constant indices:
-- go: PermuteConstantHi
+- go: permuteScalarsHi
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a permutation of vector x using constant indices:
-- go: PermuteConstantHiGrouped
+- go: permuteScalarsHiGrouped
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using constant indices:
- go: Select128FromPair
commutative: false
documentation: !string |-
- // NAME selects the low and high 128-bit halves from the 128-bit halves
- // of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+ // NAME treats the 256-bit vectors x and y as a single vector of four
+ // 128-bit elements, and returns a 256-bit result formed by
+ // concatenating the two elements specified by lo and hi.
+ // For example, {4,5}.NAME(3,0,{6,7}) returns {7,4}.
- go: ConcatShiftBytesRight
commutative: false
- *f64xN
- go: Permute
- asm: "VPERM[BWDQ]|VPERMP[SD]"
+ asm: "VPERMQ|VPERMPD"
+ addDoc: !string |-
+ // Only the low 2 bits (values 0-3) of each element of indices are used.
operandOrder: "21Type1"
in:
- &anyindices
go: $t
name: indices
overwriteBase: uint
+ - &any4
+ go: $t
+ lanes: 4
+ out:
- &any
go: $t
+
+- go: Permute
+ asm: "VPERM[WDQ]|VPERMP[SD]"
+ addDoc: !string |-
+ // Only the low 3 bits (values 0-7) of each element of indices are used.
+ operandOrder: "21Type1"
+ in:
+ - *anyindices
+ - &any8
+ go: $t
+ lanes: 8
+ out:
+ - *any
+
+- go: Permute
+ asm: "VPERM[BWD]|VPERMPS"
+ addDoc: !string |-
+ // Only the low 4 bits (values 0-15) of each element of indices are used.
+ operandOrder: "21Type1"
+ in:
+ - *anyindices
+ - &any16
+ go: $t
+ lanes: 16
out:
- *any
-- go: Permute2
+- go: Permute
+ asm: "VPERM[BW]"
+ addDoc: !string |-
+ // Only the low 5 bits (values 0-31) of each element of indices are used.
+ operandOrder: "21Type1"
+ in:
+ - *anyindices
+ - &any32
+ go: $t
+ lanes: 32
+ out:
+ - *any
+
+- go: Permute
+ asm: "VPERMB"
+ addDoc: !string |-
+ // Only the low 6 bits (values 0-63) of each element of indices are used.
+ operandOrder: "21Type1"
+ in:
+ - *anyindices
+ - &any64
+ go: $t
+ lanes: 64
+ out:
+ - *any
+
+- go: ConcatPermute
asm: "VPERMI2[BWDQ]|VPERMI2P[SD]"
# Because we are overwriting the receiver's type, we
# have to move the receiver to be a parameter so that
base: $b
# VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. (It's AVX)
-- go: Permute
+- go: PermuteOrZero
asm: VPSHUFB
addDoc: !string |-
- // However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
+ // The lower four bits of each byte-sized index in indices select an element from x,
+ // unless the index's sign bit is set, in which case zero is used instead.
in:
- &128any
bits: 128
go: $t
- bits: 128
- go: $t
name: indices
+ base: int # always signed
out:
- *128any
-- go: PermuteGrouped
+
+- go: PermuteOrZeroGrouped
asm: VPSHUFB
addDoc: !string |-
- // result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
- // Only the needed bits to represent the index of a group of x are used in indices' elements.
- // However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
+ // result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+ // The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
+ // unless the index's sign bit is set, in which case zero is used instead.
// Each group is of size 128-bit.
in:
- &256Or512any
bits: "256|512"
go: $t
- bits: "256|512"
- go: $t
+ base: int
name: indices
out:
- *256Or512any
-- go: PermuteConstant
+- go: permuteScalars
asm: VPSHUFD
addDoc: !string |-
- // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
- // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
+ // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+ // indices packs four 2-bit values into a byte; indices[0:2] is the first index.
in:
- *128any
- class: immediate
immOffset: 0
name: indices
+ hideMaskMethods: true
out:
- *128any
-- go: PermuteConstantGrouped
+
+- go: permuteScalarsGrouped
asm: VPSHUFD
addDoc: !string |-
- // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
- // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
+ // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+ // indices packs four 2-bit values into a byte; indices[0:2] is the first index.
// Each group is of size 128-bit.
in:
- *256Or512any
- class: immediate
immOffset: 0
name: indices
+ hideMaskMethods: true
out:
- *256Or512any
-- go: PermuteConstantLo
- asm: VPSHUFHW
+- go: permuteScalarsLo
+ asm: VPSHUFLW
addDoc: !string |-
- // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
- // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
+ // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
+ // indices packs four 2-bit values into a byte; indices[0:2] is the first index.
in:
- - *128any
+ - &128lanes8
+ bits: 128
+ go: $t
+ elemBits: 16
- class: immediate
immOffset: 0
name: indices
+ hideMaskMethods: true
out:
- - *128any
-- go: PermuteConstantLoGrouped
- asm: VPSHUFHW
+ - *128lanes8
+
+- go: permuteScalarsLoGrouped
+ asm: VPSHUFLW
addDoc: !string |-
- // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
- // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
+ //
+ // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
+ // x_group1[indices[0:2]], ...}
+ //
+ // indices packs four 2-bit values into a byte; indices[0:2] is the first index.
// Each group is of size 128-bit.
in:
- - *256Or512any
+ - &256Or512lanes8
+ bits: "256|512"
+ go: $t
+ elemBits: 16
- class: immediate
immOffset: 0
name: indices
+ hideMaskMethods: true
out:
- - *256Or512any
+ - *256Or512lanes8
-- go: PermuteConstantHi
+- go: permuteScalarsHi
asm: VPSHUFHW
addDoc: !string |-
- // result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
- // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
+ // result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+ // indices packs four 2-bit values into a byte; indices[0:2] is the first index.
in:
- - *128any
+ - *128lanes8
- class: immediate
immOffset: 0
name: indices
+ hideMaskMethods: true
out:
- - *128any
-- go: PermuteConstantHiGrouped
+ - *128lanes8
+
+- go: permuteScalarsHiGrouped
asm: VPSHUFHW
addDoc: !string |-
- // result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
- // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
+ // result =
+ //
+ // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
+ // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
+ //
+ // indices packs four 2-bit values into a byte; indices[0:2] is the first index.
// Each group is of size 128-bit.
in:
- - *256Or512any
+ - *256Or512lanes8
- class: immediate
immOffset: 0
name: indices
+ hideMaskMethods: true
out:
- - *256Or512any
+ - *256Or512lanes8
- go: InterleaveHi
asm: VPUNPCKH(QDQ|DQ|WD|WB)
}
}
-func TestPermute2(t *testing.T) {
+func TestPermuteOrZero(t *testing.T) {
+ x := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+ indices := []int8{7, 6, 5, 4, 3, 2, 1, 0, -1, 8, -1, 9, -1, 10, -1, 11}
+ want := []uint8{8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 0, 10, 0, 11, 0, 12}
+ got := make([]uint8, len(x))
+ simd.LoadUint8x16Slice(x).PermuteOrZero(simd.LoadInt8x16Slice(indices)).StoreSlice(got)
+	for i := range got {
+ if want[i] != got[i] {
+ t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
+ }
+ }
+}
+
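+// A minimal sketch of a grouped-variant test; it assumes LoadUint8x32Slice and
+// LoadInt8x32Slice exist with the usual slice-load signatures. Each 128-bit group
+// of x is permuted independently by the matching group of indices, and an index
+// with its sign bit set produces zero in that lane.
+func TestPermuteOrZeroGrouped(t *testing.T) {
+	x := []uint8{
+		1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, // group 0
+		21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, // group 1
+	}
+	indices := []int8{
+		1, 0, 3, 2, 5, 4, 7, 6, -1, 8, -1, 9, -1, 10, -1, 11, // indexes into group 0
+		1, 0, 3, 2, 5, 4, 7, 6, -1, 8, -1, 9, -1, 10, -1, 11, // indexes into group 1
+	}
+	want := []uint8{
+		2, 1, 4, 3, 6, 5, 8, 7, 0, 9, 0, 10, 0, 11, 0, 12,
+		22, 21, 24, 23, 26, 25, 28, 27, 0, 29, 0, 30, 0, 31, 0, 32,
+	}
+	got := make([]uint8, len(x))
+	simd.LoadUint8x32Slice(x).PermuteOrZeroGrouped(simd.LoadInt8x32Slice(indices)).StoreSlice(got)
+	for i := range got {
+		if want[i] != got[i] {
+			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
+		}
+	}
+}
+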
+func TestConcatPermute(t *testing.T) {
if !simd.X86.AVX512() {
t.Skip("Test requires X86.AVX512, not available on this hardware")
return
indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0}
want := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
got := make([]int64, 8)
- simd.LoadInt64x8Slice(x).Permute2(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got)
+ simd.LoadInt64x8Slice(x).ConcatPermute(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got)
for i := range 8 {
if want[i] != got[i] {
t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
}
}
}
+
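+// A minimal sketch of a 16-bit-lane ConcatPermute test; it assumes LoadUint16x8Slice
+// exists with the usual slice-load signature. Index values 0-7 select from x and
+// 8-15 select from y, since the lanes of x and y are treated as one concatenated vector.
+func TestConcatPermuteUint16x8(t *testing.T) {
+	if !simd.X86.AVX512() {
+		t.Skip("Test requires X86.AVX512, not available on this hardware")
+		return
+	}
+	x := []uint16{1, 2, 3, 4, 5, 6, 7, 8}
+	y := []uint16{11, 12, 13, 14, 15, 16, 17, 18}
+	indices := []uint16{8, 0, 9, 1, 10, 2, 15, 7}
+	want := []uint16{11, 1, 12, 2, 13, 3, 18, 8}
+	got := make([]uint16, len(x))
+	simd.LoadUint16x8Slice(x).ConcatPermute(simd.LoadUint16x8Slice(y), simd.LoadUint16x8Slice(indices)).StoreSlice(got)
+	for i := range got {
+		if want[i] != got[i] {
+			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
+		}
+	}
+}
+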
+func TestPermuteScalars(t *testing.T) {
+ x := []int32{11, 12, 13, 14}
+ want := []int32{12, 13, 14, 11}
+ got := make([]int32, 4)
+ simd.LoadInt32x4Slice(x).PermuteScalars(1, 2, 3, 0).StoreSlice(got)
+ for i := range 4 {
+ if want[i] != got[i] {
+ t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
+ }
+ }
+}
+
+func TestPermuteScalarsGrouped(t *testing.T) {
+ x := []int32{11, 12, 13, 14, 21, 22, 23, 24}
+ want := []int32{12, 13, 14, 11, 22, 23, 24, 21}
+ got := make([]int32, 8)
+ simd.LoadInt32x8Slice(x).PermuteScalarsGrouped(1, 2, 3, 0).StoreSlice(got)
+ for i := range 8 {
+ if want[i] != got[i] {
+ t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
+ }
+ }
+}
+
+func TestPermuteScalarsHi(t *testing.T) {
+ x := []int16{-1, -2, -3, -4, 11, 12, 13, 14}
+ want := []int16{-1, -2, -3, -4, 12, 13, 14, 11}
+ got := make([]int16, len(x))
+ simd.LoadInt16x8Slice(x).PermuteScalarsHi(1, 2, 3, 0).StoreSlice(got)
+ for i := range got {
+ if want[i] != got[i] {
+ t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
+ }
+ }
+}
+
+func TestPermuteScalarsLo(t *testing.T) {
+ x := []int16{11, 12, 13, 14, 4, 5, 6, 7}
+ want := []int16{12, 13, 14, 11, 4, 5, 6, 7}
+ got := make([]int16, len(x))
+ simd.LoadInt16x8Slice(x).PermuteScalarsLo(1, 2, 3, 0).StoreSlice(got)
+ for i := range got {
+ if want[i] != got[i] {
+ t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
+ }
+ }
+}
+
+func TestPermuteScalarsHiGrouped(t *testing.T) {
+ x := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114}
+ want := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111}
+ got := make([]int16, len(x))
+ simd.LoadInt16x16Slice(x).PermuteScalarsHiGrouped(1, 2, 3, 0).StoreSlice(got)
+ for i := range got {
+ if want[i] != got[i] {
+ t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
+ }
+ }
+}
+
+func TestPermuteScalarsLoGrouped(t *testing.T) {
+ x := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17}
+ want := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17}
+ got := make([]int16, len(x))
+ simd.LoadInt16x16Slice(x).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(got)
+ for i := range got {
+ if want[i] != got[i] {
+ t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
+ }
+ }
+}
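+
+// A minimal sketch checking the documented index truncation for 16-bit Permute
+// (only the low 3 bits of each index are used for an 8-lane vector); it assumes
+// LoadUint16x8Slice exists with the usual slice-load signature.
+func TestPermuteInt16x8(t *testing.T) {
+	if !simd.X86.AVX512() {
+		t.Skip("Test requires X86.AVX512, not available on this hardware")
+		return
+	}
+	x := []int16{10, 11, 12, 13, 14, 15, 16, 17}
+	indices := []uint16{7, 9, 5, 4, 3, 2, 1, 8} // 9 and 8 reduce to 1 and 0
+	want := []int16{17, 11, 15, 14, 13, 12, 11, 10}
+	got := make([]int16, len(x))
+	simd.LoadInt16x8Slice(x).Permute(simd.LoadUint16x8Slice(indices)).StoreSlice(got)
+	for i := range got {
+		if want[i] != got[i] {
+			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
+		}
+	}
+}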
// Asm: VPCOMPRESSQ, CPU Feature: AVX512
func (x Uint64x8) Compress(mask Mask64x8) Uint64x8
+/* ConcatPermute */
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x16) ConcatPermute(y Int8x16, indices Uint8x16) Int8x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x16) ConcatPermute(y Uint8x16, indices Uint8x16) Uint8x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x32) ConcatPermute(y Int8x32, indices Uint8x32) Int8x32
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x32) ConcatPermute(y Uint8x32, indices Uint8x32) Uint8x32
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x64) ConcatPermute(y Int8x64, indices Uint8x64) Int8x64
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x64) ConcatPermute(y Uint8x64, indices Uint8x64) Uint8x64
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Int16x8) ConcatPermute(y Int16x8, indices Uint16x8) Int16x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Uint16x8) ConcatPermute(y Uint16x8, indices Uint16x8) Uint16x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Int16x16) ConcatPermute(y Int16x16, indices Uint16x16) Int16x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Uint16x16) ConcatPermute(y Uint16x16, indices Uint16x16) Uint16x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Int16x32) ConcatPermute(y Int16x32, indices Uint16x32) Int16x32
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Uint16x32) ConcatPermute(y Uint16x32, indices Uint16x32) Uint16x32
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512
+func (x Float32x4) ConcatPermute(y Float32x4, indices Uint32x4) Float32x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Int32x4) ConcatPermute(y Int32x4, indices Uint32x4) Int32x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Uint32x4) ConcatPermute(y Uint32x4, indices Uint32x4) Uint32x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512
+func (x Float32x8) ConcatPermute(y Float32x8, indices Uint32x8) Float32x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Int32x8) ConcatPermute(y Int32x8, indices Uint32x8) Int32x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Uint32x8) ConcatPermute(y Uint32x8, indices Uint32x8) Uint32x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512
+func (x Float32x16) ConcatPermute(y Float32x16, indices Uint32x16) Float32x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Int32x16) ConcatPermute(y Int32x16, indices Uint32x16) Int32x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Uint32x16) ConcatPermute(y Uint32x16, indices Uint32x16) Uint32x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512
+func (x Float64x2) ConcatPermute(y Float64x2, indices Uint64x2) Float64x2
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Int64x2) ConcatPermute(y Int64x2, indices Uint64x2) Int64x2
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Uint64x2) ConcatPermute(y Uint64x2, indices Uint64x2) Uint64x2
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512
+func (x Float64x4) ConcatPermute(y Float64x4, indices Uint64x4) Float64x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Int64x4) ConcatPermute(y Int64x4, indices Uint64x4) Int64x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Uint64x4) ConcatPermute(y Uint64x4, indices Uint64x4) Uint64x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512
+func (x Float64x8) ConcatPermute(y Float64x8, indices Uint64x8) Float64x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Int64x8) ConcatPermute(y Int64x8, indices Uint64x8) Int64x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Uint64x8) ConcatPermute(y Uint64x8, indices Uint64x8) Uint64x8
+
/* ConcatShiftBytesRight */
// ConcatShiftBytesRight concatenates x and y and shift it right by constant bytes.
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
-// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
+// Only the low 4 bits (values 0-15) of each element of indices are used.
//
-// Asm: VPSHUFB, CPU Feature: AVX
-func (x Int8x16) Permute(indices Int8x16) Int8x16
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x16) Permute(indices Uint8x16) Int8x16
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
-// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
+// Only the low 4 bits (values 0-15) of each element of indices are used.
//
-// Asm: VPSHUFB, CPU Feature: AVX
+// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x16) Permute(indices Uint8x16) Uint8x16
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 5 bits (values 0-31) of each element of indices are used.
//
// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x32) Permute(indices Uint8x32) Int8x32
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 5 bits (values 0-31) of each element of indices are used.
//
// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x32) Permute(indices Uint8x32) Uint8x32
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 6 bits (values 0-63) of each element of indices are used.
//
// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x64) Permute(indices Uint8x64) Int8x64
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 6 bits (values 0-63) of each element of indices are used.
//
// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x64) Permute(indices Uint8x64) Uint8x64
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 3 bits (values 0-7) of each element of indices are used.
//
// Asm: VPERMW, CPU Feature: AVX512
func (x Int16x8) Permute(indices Uint16x8) Int16x8
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 3 bits (values 0-7) of each element of indices are used.
//
// Asm: VPERMW, CPU Feature: AVX512
func (x Uint16x8) Permute(indices Uint16x8) Uint16x8
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 4 bits (values 0-15) of each element of indices are used.
//
// Asm: VPERMW, CPU Feature: AVX512
func (x Int16x16) Permute(indices Uint16x16) Int16x16
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 4 bits (values 0-15) of each element of indices are used.
//
// Asm: VPERMW, CPU Feature: AVX512
func (x Uint16x16) Permute(indices Uint16x16) Uint16x16
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 5 bits (values 0-31) of each element of indices are used.
//
// Asm: VPERMW, CPU Feature: AVX512
func (x Int16x32) Permute(indices Uint16x32) Int16x32
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 5 bits (values 0-31) of each element of indices are used.
//
// Asm: VPERMW, CPU Feature: AVX512
func (x Uint16x32) Permute(indices Uint16x32) Uint16x32
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 3 bits (values 0-7) of each element of indices are used.
//
// Asm: VPERMPS, CPU Feature: AVX2
func (x Float32x8) Permute(indices Uint32x8) Float32x8
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 3 bits (values 0-7) of each element of indices are used.
//
// Asm: VPERMD, CPU Feature: AVX2
func (x Int32x8) Permute(indices Uint32x8) Int32x8
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 3 bits (values 0-7) of each element of indices are used.
//
// Asm: VPERMD, CPU Feature: AVX2
func (x Uint32x8) Permute(indices Uint32x8) Uint32x8
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 4 bits (values 0-15) of each element of indices are used.
//
// Asm: VPERMPS, CPU Feature: AVX512
func (x Float32x16) Permute(indices Uint32x16) Float32x16
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 4 bits (values 0-15) of each element of indices are used.
//
// Asm: VPERMD, CPU Feature: AVX512
func (x Int32x16) Permute(indices Uint32x16) Int32x16
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 4 bits (values 0-15) of each element of indices are used.
//
// Asm: VPERMD, CPU Feature: AVX512
func (x Uint32x16) Permute(indices Uint32x16) Uint32x16
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 2 bits (values 0-3) of each element of indices are used.
//
// Asm: VPERMPD, CPU Feature: AVX512
func (x Float64x4) Permute(indices Uint64x4) Float64x4
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 2 bits (values 0-3) of each element of indices are used.
//
// Asm: VPERMQ, CPU Feature: AVX512
func (x Int64x4) Permute(indices Uint64x4) Int64x4
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 2 bits (values 0-3) of each element of indices are used.
//
// Asm: VPERMQ, CPU Feature: AVX512
func (x Uint64x4) Permute(indices Uint64x4) Uint64x4
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 3 bits (values 0-7) of each element of indices are used.
//
// Asm: VPERMPD, CPU Feature: AVX512
func (x Float64x8) Permute(indices Uint64x8) Float64x8
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 3 bits (values 0-7) of each element of indices are used.
//
// Asm: VPERMQ, CPU Feature: AVX512
func (x Int64x8) Permute(indices Uint64x8) Int64x8
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// Only the low 3 bits (values 0-7) of each element of indices are used.
//
// Asm: VPERMQ, CPU Feature: AVX512
func (x Uint64x8) Permute(indices Uint64x8) Uint64x8
-/* Permute2 */
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Int8x16) Permute2(y Int8x16, indices Uint8x16) Int8x16
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Uint8x16) Permute2(y Uint8x16, indices Uint8x16) Uint8x16
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Int8x32) Permute2(y Int8x32, indices Uint8x32) Int8x32
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Uint8x32) Permute2(y Uint8x32, indices Uint8x32) Uint8x32
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Int8x64) Permute2(y Int8x64, indices Uint8x64) Int8x64
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Uint8x64) Permute2(y Uint8x64, indices Uint8x64) Uint8x64
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Int16x8) Permute2(y Int16x8, indices Uint16x8) Int16x8
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Uint16x8) Permute2(y Uint16x8, indices Uint16x8) Uint16x8
+/* PermuteOrZero */
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Int16x16) Permute2(y Int16x16, indices Uint16x16) Int16x16
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Uint16x16) Permute2(y Uint16x16, indices Uint16x16) Uint16x16
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Int16x32) Permute2(y Int16x32, indices Uint16x32) Int16x32
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Uint16x32) Permute2(y Uint16x32, indices Uint16x32) Uint16x32
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PS, CPU Feature: AVX512
-func (x Float32x4) Permute2(y Float32x4, indices Uint32x4) Float32x4
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Int32x4) Permute2(y Int32x4, indices Uint32x4) Int32x4
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Uint32x4) Permute2(y Uint32x4, indices Uint32x4) Uint32x4
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PS, CPU Feature: AVX512
-func (x Float32x8) Permute2(y Float32x8, indices Uint32x8) Float32x8
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Int32x8) Permute2(y Int32x8, indices Uint32x8) Int32x8
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Uint32x8) Permute2(y Uint32x8, indices Uint32x8) Uint32x8
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PS, CPU Feature: AVX512
-func (x Float32x16) Permute2(y Float32x16, indices Uint32x16) Float32x16
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Int32x16) Permute2(y Int32x16, indices Uint32x16) Int32x16
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Uint32x16) Permute2(y Uint32x16, indices Uint32x16) Uint32x16
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PD, CPU Feature: AVX512
-func (x Float64x2) Permute2(y Float64x2, indices Uint64x2) Float64x2
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Int64x2) Permute2(y Int64x2, indices Uint64x2) Int64x2
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Uint64x2) Permute2(y Uint64x2, indices Uint64x2) Uint64x2
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PD, CPU Feature: AVX512
-func (x Float64x4) Permute2(y Float64x4, indices Uint64x4) Float64x4
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Int64x4) Permute2(y Int64x4, indices Uint64x4) Int64x4
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Uint64x4) Permute2(y Uint64x4, indices Uint64x4) Uint64x4
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PD, CPU Feature: AVX512
-func (x Float64x8) Permute2(y Float64x8, indices Uint64x8) Float64x8
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Int64x8) Permute2(y Int64x8, indices Uint64x8) Int64x8
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Uint64x8) Permute2(y Uint64x8, indices Uint64x8) Uint64x8
-
-/* PermuteConstant */
-
-// PermuteConstant performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX
-func (x Int32x4) PermuteConstant(indices uint8) Int32x4
-
-// PermuteConstant performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX
-func (x Uint32x4) PermuteConstant(indices uint8) Uint32x4
-
-/* PermuteConstantGrouped */
-
-// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX2
-func (x Int32x8) PermuteConstantGrouped(indices uint8) Int32x8
-
-// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX512
-func (x Int32x16) PermuteConstantGrouped(indices uint8) Int32x16
-
-// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX2
-func (x Uint32x8) PermuteConstantGrouped(indices uint8) Uint32x8
-
-// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX512
-func (x Uint32x16) PermuteConstantGrouped(indices uint8) Uint32x16
-
-/* PermuteConstantHi */
-
-// PermuteConstantHi performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Int16x8) PermuteConstantHi(indices uint8) Int16x8
-
-// PermuteConstantHi performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX
-func (x Int32x4) PermuteConstantHi(indices uint8) Int32x4
-
-// PermuteConstantHi performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Uint16x8) PermuteConstantHi(indices uint8) Uint16x8
-
-// PermuteConstantHi performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX
-func (x Uint32x4) PermuteConstantHi(indices uint8) Uint32x4
-
-/* PermuteConstantHiGrouped */
-
-// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX2
-func (x Int16x16) PermuteConstantHiGrouped(indices uint8) Int16x16
-
-// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Int16x32) PermuteConstantHiGrouped(indices uint8) Int16x32
-
-// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX2
-func (x Uint16x16) PermuteConstantHiGrouped(indices uint8) Uint16x16
-
-// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Uint16x32) PermuteConstantHiGrouped(indices uint8) Uint16x32
-
-/* PermuteConstantLo */
-
-// PermuteConstantLo performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Int16x8) PermuteConstantLo(indices uint8) Int16x8
-
-// PermuteConstantLo performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX
-func (x Int32x4) PermuteConstantLo(indices uint8) Int32x4
-
-// PermuteConstantLo performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Uint16x8) PermuteConstantLo(indices uint8) Uint16x8
-
-// PermuteConstantLo performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX
-func (x Uint32x4) PermuteConstantLo(indices uint8) Uint32x4
-
-/* PermuteConstantLoGrouped */
-
-// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX2
-func (x Int16x16) PermuteConstantLoGrouped(indices uint8) Int16x16
-
-// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Int16x32) PermuteConstantLoGrouped(indices uint8) Int16x32
-
-// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+// PermuteOrZero performs a full permutation of vector x using indices:
+// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The lower four bits of each byte-sized index in indices select an element from x,
+// unless the index's sign bit is set, in which case zero is used instead.
//
-// Asm: VPSHUFHW, CPU Feature: AVX2
-func (x Uint16x16) PermuteConstantLoGrouped(indices uint8) Uint16x16
+// Asm: VPSHUFB, CPU Feature: AVX
+func (x Int8x16) PermuteOrZero(indices Int8x16) Int8x16
-// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+// PermuteOrZero performs a full permutation of vector x using indices:
+// result = {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The lower four bits of each byte-sized index in indices select an element from x,
+// unless the index's sign bit is set, in which case zero is used instead.
//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Uint16x32) PermuteConstantLoGrouped(indices uint8) Uint16x32
+// Asm: VPSHUFB, CPU Feature: AVX
+func (x Uint8x16) PermuteOrZero(indices Int8x16) Uint8x16
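
// The sketch below is illustrative only and assumes it sits in the same
// package as the declarations above. It shows the typical use of
// PermuteOrZero as an in-register 16-entry byte lookup table: the low four
// bits of each index select a table entry, and any index whose sign bit is
// set (i.e. any negative byte) forces the corresponding result byte to zero.
func lookupBytes(table Uint8x16, idx Int8x16) Uint8x16 {
	return table.PermuteOrZero(idx)
}
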
-/* PermuteGrouped */
+/* PermuteOrZeroGrouped */
-// PermuteGrouped performs a grouped permutation of vector x using indices:
-// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
-// Only the needed bits to represent the index of a group of x are used in indices' elements.
-// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
+// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices:
+// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
+// unless the index's sign bit is set, in which case zero is used instead.
// Each group is of size 128-bit.
//
// Asm: VPSHUFB, CPU Feature: AVX2
-func (x Int8x32) PermuteGrouped(indices Int8x32) Int8x32
+func (x Int8x32) PermuteOrZeroGrouped(indices Int8x32) Int8x32
-// PermuteGrouped performs a grouped permutation of vector x using indices:
-// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
-// Only the needed bits to represent the index of a group of x are used in indices' elements.
-// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
+// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices:
+// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
+// unless the index's sign bit is set, in which case zero is used instead.
// Each group is of size 128-bit.
//
// Asm: VPSHUFB, CPU Feature: AVX512
-func (x Int8x64) PermuteGrouped(indices Int8x64) Int8x64
+func (x Int8x64) PermuteOrZeroGrouped(indices Int8x64) Int8x64
-// PermuteGrouped performs a grouped permutation of vector x using indices:
-// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
-// Only the needed bits to represent the index of a group of x are used in indices' elements.
-// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
+// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices:
+// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
+// unless the index's sign bit is set, in which case zero is used instead.
// Each group is of size 128-bit.
//
// Asm: VPSHUFB, CPU Feature: AVX2
-func (x Uint8x32) PermuteGrouped(indices Uint8x32) Uint8x32
+func (x Uint8x32) PermuteOrZeroGrouped(indices Int8x32) Uint8x32
-// PermuteGrouped performs a grouped permutation of vector x using indices:
-// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
-// Only the needed bits to represent the index of a group of x are used in indices' elements.
-// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
+// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices:
+// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
+// unless the index's sign bit is set, in which case zero is used instead.
// Each group is of size 128-bit.
//
// Asm: VPSHUFB, CPU Feature: AVX512
-func (x Uint8x64) PermuteGrouped(indices Uint8x64) Uint8x64
+func (x Uint8x64) PermuteOrZeroGrouped(indices Int8x64) Uint8x64
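
// Illustrative sketch, assuming the same package as the declarations above.
// It highlights the grouped semantics: each 16-byte block of idx indexes only
// into the corresponding 128-bit group of x, so an index value of 3 in the
// upper half of a 256-bit vector selects x[16+3], never x[3].
func lookupBytesPerLane(x Uint8x32, idx Int8x32) Uint8x32 {
	return x.PermuteOrZeroGrouped(idx)
}
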
/* Reciprocal */
/* Select128FromPair */
-// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
-// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of
+// four 128-bit elements (x's halves are elements 0 and 1, y's are 2 and 3) and
+// returns a 256-bit result whose low 128 bits are the element selected by lo
+// and whose high 128 bits are the element selected by hi.
+// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
//
// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
// Asm: VPERM2F128, CPU Feature: AVX
func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8
-// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
-// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of
+// four 128-bit elements (x's halves are elements 0 and 1, y's are 2 and 3) and
+// returns a 256-bit result whose low 128 bits are the element selected by lo
+// and whose high 128 bits are the element selected by hi.
+// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
//
// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
// Asm: VPERM2F128, CPU Feature: AVX
func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4
-// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
-// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of
+// four 128-bit elements (x's halves are elements 0 and 1, y's are 2 and 3) and
+// returns a 256-bit result whose low 128 bits are the element selected by lo
+// and whose high 128 bits are the element selected by hi.
+// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
//
// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
// Asm: VPERM2I128, CPU Feature: AVX2
func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8
-// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
-// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of
+// four 128-bit elements (x's halves are elements 0 and 1, y's are 2 and 3) and
+// returns a 256-bit result whose low 128 bits are the element selected by lo
+// and whose high 128 bits are the element selected by hi.
+// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
//
// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
// Asm: VPERM2I128, CPU Feature: AVX2
func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4
-// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
-// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of
+// four 128-bit elements (x's halves are elements 0 and 1, y's are 2 and 3) and
+// returns a 256-bit result whose low 128 bits are the element selected by lo
+// and whose high 128 bits are the element selected by hi.
+// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
//
// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
// Asm: VPERM2I128, CPU Feature: AVX2
func (x Uint32x8) Select128FromPair(lo, hi uint8, y Uint32x8) Uint32x8
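
// Two illustrative sketches, assuming the same package as the declarations
// above. lo picks the half placed in the result's low 128 bits and hi the
// half placed in its high 128 bits.

// swapHalves exchanges the two 128-bit halves of x.
func swapHalves(x Int32x8) Int32x8 {
	return x.Select128FromPair(1, 0, x)
}

// lowHalves returns x's low half in the result's low 128 bits and y's low
// half in its high 128 bits.
func lowHalves(x, y Int32x8) Int32x8 {
	return x.Select128FromPair(0, 2, y)
}
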
-// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
-// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of
+// four 128-bit elements (x's halves are elements 0 and 1, y's are 2 and 3) and
+// returns a 256-bit result whose low 128 bits are the element selected by lo
+// and whose high 128 bits are the element selected by hi.
+// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
//
// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
// Asm: VSHUFPD, CPU Feature: AVX512
func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8
+/* permuteScalars */
+
+// permuteScalars performs a permutation of vector x using constant indices:
+// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX
+func (x Int32x4) permuteScalars(indices uint8) Int32x4
+
+// permuteScalars performs a permutation of vector x using constant indices:
+// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX
+func (x Uint32x4) permuteScalars(indices uint8) Uint32x4
+
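// Illustrative sketch, assuming the same package (permuteScalars is
// unexported). The byte argument packs four 2-bit indices, least-significant
// field first: 3 | 2<<2 | 1<<4 | 0<<6 == 0x1b encodes the indices (3, 2, 1, 0),
// so the call below returns {x[3], x[2], x[1], x[0]}.
func reverse4(x Int32x4) Int32x4 {
	return x.permuteScalars(0x1b)
}
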
+/* permuteScalarsGrouped */
+
+// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
+// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX2
+func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8
+
+// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
+// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX512
+func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16
+
+// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
+// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX2
+func (x Uint32x8) permuteScalarsGrouped(indices uint8) Uint32x8
+
+// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
+// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX512
+func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16
+
+/* permuteScalarsHi */
+
+// permuteScalarsHi performs a permutation of vector x using constant indices:
+// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x8) permuteScalarsHi(indices uint8) Int16x8
+
+// permuteScalarsHi performs a permutation of vector x using constant indices:
+// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x8) permuteScalarsHi(indices uint8) Uint16x8
+
+/* permuteScalarsHiGrouped */
+
+// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
+// result =
+//
+// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
+// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Int16x16) permuteScalarsHiGrouped(indices uint8) Int16x16
+
+// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
+// result =
+//
+// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
+// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x32) permuteScalarsHiGrouped(indices uint8) Int16x32
+
+// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
+// result =
+//
+// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
+// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Uint16x16) permuteScalarsHiGrouped(indices uint8) Uint16x16
+
+// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
+// result =
+//
+// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
+// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32
+
+/* permuteScalarsLo */
+
+// permuteScalarsLo performs a permutation of vector x using constant indices:
+// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Int16x8) permuteScalarsLo(indices uint8) Int16x8
+
+// permuteScalarsLo performs a permutation of vector x using constant indices:
+// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Uint16x8) permuteScalarsLo(indices uint8) Uint16x8
+
+/* permuteScalarsLoGrouped */
+
+// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
+//
+// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
+// x_group1[indices[0:2]], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX2
+func (x Int16x16) permuteScalarsLoGrouped(indices uint8) Int16x16
+
+// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
+//
+// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
+// x_group1[indices[0:2]], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Int16x32) permuteScalarsLoGrouped(indices uint8) Int16x32
+
+// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
+//
+// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
+// x_group1[indices[0:2]], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX2
+func (x Uint16x16) permuteScalarsLoGrouped(indices uint8) Uint16x16
+
+// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
+//
+// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
+// x_group1[indices[0:2]], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Uint16x32) permuteScalarsLoGrouped(indices uint8) Uint16x32
+
/* tern */
// tern performs a logical operation on three vectors based on the 8-bit truth table.
}
panic("missing case, switch should be exhaustive")
}
+
+/* PermuteScalars */
+
+// PermuteScalars performs a permutation of vector x's elements using the supplied indices:
+//
+// result = {x[a], x[b], x[c], x[d]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table may be generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX
+func (x Int32x4) PermuteScalars(a, b, c, d uint8) Int32x4 {
+ return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalars performs a permutation of vector x's elements using the supplied indices:
+//
+// result = {x[a], x[b], x[c], x[d]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table may be generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX
+func (x Uint32x4) PermuteScalars(a, b, c, d uint8) Uint32x4 {
+ return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
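// Illustrative sketches of the exported wrapper, assuming the same package;
// with constant arguments each call compiles to a single VPSHUFD, per the
// documentation above.

// reverseLanes returns {x[3], x[2], x[1], x[0]}.
func reverseLanes(x Int32x4) Int32x4 {
	return x.PermuteScalars(3, 2, 1, 0)
}

// broadcastLane0 returns {x[0], x[0], x[0], x[0]}.
func broadcastLane0(x Int32x4) Int32x4 {
	return x.PermuteScalars(0, 0, 0, 0)
}
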
+/* PermuteScalarsGrouped */
+
+// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table may be generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX2
+func (x Int32x8) PermuteScalarsGrouped(a, b, c, d uint8) Int32x8 {
+ return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// { x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4],
+// x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table may be generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX512
+func (x Int32x16) PermuteScalarsGrouped(a, b, c, d uint8) Int32x16 {
+ return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX2
+func (x Uint32x8) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x8 {
+ return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// { x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4],
+// x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX512
+func (x Uint32x16) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x16 {
+ return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
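// Illustrative sketch, assuming the same package. The same four indices are
// applied independently to every 128-bit group, so this rotates each group of
// a 256-bit vector left by one element:
// result = {x[1], x[2], x[3], x[0], x[5], x[6], x[7], x[4]}.
func rotateGroupsLeft(x Int32x8) Int32x8 {
	return x.PermuteScalarsGrouped(1, 2, 3, 0)
}
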
+/* PermuteScalarsHi */
+
+// PermuteScalarsHi performs a permutation of vector x using the supplied indices:
+//
+// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x8) PermuteScalarsHi(a, b, c, d uint8) Int16x8 {
+ return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsHi performs a permutation of vector x using the supplied indices:
+//
+// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x8) PermuteScalarsHi(a, b, c, d uint8) Uint16x8 {
+ return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
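// Illustrative sketch, assuming the same package. PermuteScalarsHi only
// rearranges lanes 4-7, so this returns
// {x[0], x[1], x[2], x[3], x[7], x[6], x[5], x[4]}.
func reverseHighLanes(x Int16x8) Int16x8 {
	return x.PermuteScalarsHi(3, 2, 1, 0)
}
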
+/* PermuteScalarsHiGrouped */
+
+// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
+// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Int16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x16 {
+ return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
+// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12],
+// x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
+// x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x32 {
+ return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
+// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
+//
+// Each group is of size 128-bit.
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Uint16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x16 {
+ return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// { x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
+// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12],
+// x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
+// x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x32 {
+ return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+/* PermuteScalarsLo */
+
+// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
+//
+// result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Int16x8) PermuteScalarsLo(a, b, c, d uint8) Int16x8 {
+ return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
+//
+// result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Uint16x8) PermuteScalarsLo(a, b, c, d uint8) Uint16x8 {
+ return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
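// Illustrative sketch, assuming the same package. PermuteScalarsLo only
// rearranges lanes 0-3, so this returns
// {x[0], x[0], x[0], x[0], x[4], x[5], x[6], x[7]}.
func splatLowLane(x Uint16x8) Uint16x8 {
	return x.PermuteScalarsLo(0, 0, 0, 0)
}
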
+/* PermuteScalarsLoGrouped */
+
+// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
+// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX2
+func (x Int16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x16 {
+ return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
+// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15],
+// x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
+// x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Int16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x32 {
+ return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
+// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX2
+func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 {
+ return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
+// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15],
+// x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
+// x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
+//
+// Each group is of size 128-bit.
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 {
+ return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
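
// Illustrative sketch, assuming the same package. Combining the Lo and Hi
// grouped shuffles with indices (1, 0, 3, 2) swaps every adjacent pair of
// 16-bit lanes in each 128-bit group of the vector.
func swapAdjacentLanes(x Uint16x16) Uint16x16 {
	return x.PermuteScalarsLoGrouped(1, 0, 3, 2).PermuteScalarsHiGrouped(1, 0, 3, 2)
}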