[dev.simd] simd: fix signatures for PermuteConstant* methods

author David Chase <drchase@google.com>
Mon, 17 Nov 2025 20:31:36 +0000 (15:31 -0500)
committer David Chase <drchase@google.com>
Fri, 21 Nov 2025 01:47:32 +0000 (17:47 -0800)
diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go

index 3f8ce17972f92a79150464de806bfa6315242b7b..b70a72b2f85f82805f11337da20dcf0f5ae88dd4 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -396,7 +396,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPOR256,
                 ssa.OpAMD64VPORD512,
                 ssa.OpAMD64VPORQ512,
-               ssa.OpAMD64VPSHUFB128,
+               ssa.OpAMD64VPERMB128,
                 ssa.OpAMD64VPERMB256,
                 ssa.OpAMD64VPERMB512,
                 ssa.OpAMD64VPERMW128,
@@ -410,6 +410,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPERMQ256,
                 ssa.OpAMD64VPERMPD512,
                 ssa.OpAMD64VPERMQ512,
+               ssa.OpAMD64VPSHUFB128,
                 ssa.OpAMD64VPSHUFB256,
                 ssa.OpAMD64VPSHUFB512,
                 ssa.OpAMD64VPROLVD128,
@@ -672,9 +673,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPORQMasked128,
                 ssa.OpAMD64VPORQMasked256,
                 ssa.OpAMD64VPORQMasked512,
-               ssa.OpAMD64VPSHUFBMasked256,
-               ssa.OpAMD64VPSHUFBMasked512,
-               ssa.OpAMD64VPSHUFBMasked128,
+               ssa.OpAMD64VPERMBMasked128,
                 ssa.OpAMD64VPERMBMasked256,
                 ssa.OpAMD64VPERMBMasked512,
                 ssa.OpAMD64VPERMWMasked128,
@@ -688,6 +687,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPERMQMasked256,
                 ssa.OpAMD64VPERMPDMasked512,
                 ssa.OpAMD64VPERMQMasked512,
+               ssa.OpAMD64VPSHUFBMasked256,
+               ssa.OpAMD64VPSHUFBMasked512,
+               ssa.OpAMD64VPSHUFBMasked128,
                 ssa.OpAMD64VPROLVDMasked128,
                 ssa.OpAMD64VPROLVDMasked256,
                 ssa.OpAMD64VPROLVDMasked512,
@@ -1011,12 +1013,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VEXTRACTF64X4256,
                 ssa.OpAMD64VEXTRACTI128128,
                 ssa.OpAMD64VEXTRACTI64X4256,
-               ssa.OpAMD64VPSHUFD128,
-               ssa.OpAMD64VPSHUFD256,
-               ssa.OpAMD64VPSHUFD512,
-               ssa.OpAMD64VPSHUFHW128,
-               ssa.OpAMD64VPSHUFHW256,
-               ssa.OpAMD64VPSHUFHW512,
                 ssa.OpAMD64VPROLD128,
                 ssa.OpAMD64VPROLD256,
                 ssa.OpAMD64VPROLD512,
@@ -1029,6 +1025,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPRORQ128,
                 ssa.OpAMD64VPRORQ256,
                 ssa.OpAMD64VPRORQ512,
+               ssa.OpAMD64VPSHUFD128,
+               ssa.OpAMD64VPSHUFD256,
+               ssa.OpAMD64VPSHUFD512,
+               ssa.OpAMD64VPSHUFHW128,
+               ssa.OpAMD64VPSHUFHW256,
+               ssa.OpAMD64VPSHUFHW512,
+               ssa.OpAMD64VPSHUFLW128,
+               ssa.OpAMD64VPSHUFLW256,
+               ssa.OpAMD64VPSHUFLW512,
                 ssa.OpAMD64VPSLLW128const,
                 ssa.OpAMD64VPSLLW256const,
                 ssa.OpAMD64VPSLLW512const,
@@ -1070,12 +1075,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VREDUCEPDMasked128,
                 ssa.OpAMD64VREDUCEPDMasked256,
                 ssa.OpAMD64VREDUCEPDMasked512,
-               ssa.OpAMD64VPSHUFDMasked256,
-               ssa.OpAMD64VPSHUFDMasked512,
-               ssa.OpAMD64VPSHUFHWMasked256,
-               ssa.OpAMD64VPSHUFHWMasked512,
-               ssa.OpAMD64VPSHUFHWMasked128,
-               ssa.OpAMD64VPSHUFDMasked128,
                 ssa.OpAMD64VPROLDMasked128,
                 ssa.OpAMD64VPROLDMasked256,
                 ssa.OpAMD64VPROLDMasked512,
@@ -1088,6 +1087,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPRORQMasked128,
                 ssa.OpAMD64VPRORQMasked256,
                 ssa.OpAMD64VPRORQMasked512,
+               ssa.OpAMD64VPSHUFDMasked256,
+               ssa.OpAMD64VPSHUFDMasked512,
+               ssa.OpAMD64VPSHUFHWMasked256,
+               ssa.OpAMD64VPSHUFHWMasked512,
+               ssa.OpAMD64VPSHUFHWMasked128,
+               ssa.OpAMD64VPSHUFLWMasked256,
+               ssa.OpAMD64VPSHUFLWMasked512,
+               ssa.OpAMD64VPSHUFLWMasked128,
+               ssa.OpAMD64VPSHUFDMasked128,
                 ssa.OpAMD64VPSLLWMasked128const,
                 ssa.OpAMD64VPSLLWMasked256const,
                 ssa.OpAMD64VPSLLWMasked512const,
@@ -1209,6 +1217,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
         case ssa.OpAMD64VPDPWSSD128,
                 ssa.OpAMD64VPDPWSSD256,
                 ssa.OpAMD64VPDPWSSD512,
+               ssa.OpAMD64VPERMI2B128,
+               ssa.OpAMD64VPERMI2B256,
+               ssa.OpAMD64VPERMI2B512,
+               ssa.OpAMD64VPERMI2W128,
+               ssa.OpAMD64VPERMI2W256,
+               ssa.OpAMD64VPERMI2W512,
+               ssa.OpAMD64VPERMI2PS128,
+               ssa.OpAMD64VPERMI2D128,
+               ssa.OpAMD64VPERMI2PS256,
+               ssa.OpAMD64VPERMI2D256,
+               ssa.OpAMD64VPERMI2PS512,
+               ssa.OpAMD64VPERMI2D512,
+               ssa.OpAMD64VPERMI2PD128,
+               ssa.OpAMD64VPERMI2Q128,
+               ssa.OpAMD64VPERMI2PD256,
+               ssa.OpAMD64VPERMI2Q256,
+               ssa.OpAMD64VPERMI2PD512,
+               ssa.OpAMD64VPERMI2Q512,
                 ssa.OpAMD64VPDPBUSD128,
                 ssa.OpAMD64VPDPBUSD256,
                 ssa.OpAMD64VPDPBUSD512,
@@ -1233,24 +1259,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VFMSUBADD213PD128,
                 ssa.OpAMD64VFMSUBADD213PD256,
                 ssa.OpAMD64VFMSUBADD213PD512,
-               ssa.OpAMD64VPERMI2B128,
-               ssa.OpAMD64VPERMI2B256,
-               ssa.OpAMD64VPERMI2B512,
-               ssa.OpAMD64VPERMI2W128,
-               ssa.OpAMD64VPERMI2W256,
-               ssa.OpAMD64VPERMI2W512,
-               ssa.OpAMD64VPERMI2PS128,
-               ssa.OpAMD64VPERMI2D128,
-               ssa.OpAMD64VPERMI2PS256,
-               ssa.OpAMD64VPERMI2D256,
-               ssa.OpAMD64VPERMI2PS512,
-               ssa.OpAMD64VPERMI2D512,
-               ssa.OpAMD64VPERMI2PD128,
-               ssa.OpAMD64VPERMI2Q128,
-               ssa.OpAMD64VPERMI2PD256,
-               ssa.OpAMD64VPERMI2Q256,
-               ssa.OpAMD64VPERMI2PD512,
-               ssa.OpAMD64VPERMI2Q512,
                 ssa.OpAMD64VPSHLDVW128,
                 ssa.OpAMD64VPSHLDVW256,
                 ssa.OpAMD64VPSHLDVW512,
@@ -1316,6 +1324,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPAVGWMasked128Merging,
                 ssa.OpAMD64VPAVGWMasked256Merging,
                 ssa.OpAMD64VPAVGWMasked512Merging,
+               ssa.OpAMD64VPERMI2BMasked128,
+               ssa.OpAMD64VPERMI2BMasked256,
+               ssa.OpAMD64VPERMI2BMasked512,
+               ssa.OpAMD64VPERMI2WMasked128,
+               ssa.OpAMD64VPERMI2WMasked256,
+               ssa.OpAMD64VPERMI2WMasked512,
+               ssa.OpAMD64VPERMI2PSMasked128,
+               ssa.OpAMD64VPERMI2DMasked128,
+               ssa.OpAMD64VPERMI2PSMasked256,
+               ssa.OpAMD64VPERMI2DMasked256,
+               ssa.OpAMD64VPERMI2PSMasked512,
+               ssa.OpAMD64VPERMI2DMasked512,
+               ssa.OpAMD64VPERMI2PDMasked128,
+               ssa.OpAMD64VPERMI2QMasked128,
+               ssa.OpAMD64VPERMI2PDMasked256,
+               ssa.OpAMD64VPERMI2QMasked256,
+               ssa.OpAMD64VPERMI2PDMasked512,
+               ssa.OpAMD64VPERMI2QMasked512,
                 ssa.OpAMD64VPALIGNRMasked256Merging,
                 ssa.OpAMD64VPALIGNRMasked512Merging,
                 ssa.OpAMD64VPALIGNRMasked128Merging,
@@ -1451,24 +1477,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPORQMasked128Merging,
                 ssa.OpAMD64VPORQMasked256Merging,
                 ssa.OpAMD64VPORQMasked512Merging,
-               ssa.OpAMD64VPERMI2BMasked128,
-               ssa.OpAMD64VPERMI2BMasked256,
-               ssa.OpAMD64VPERMI2BMasked512,
-               ssa.OpAMD64VPERMI2WMasked128,
-               ssa.OpAMD64VPERMI2WMasked256,
-               ssa.OpAMD64VPERMI2WMasked512,
-               ssa.OpAMD64VPERMI2PSMasked128,
-               ssa.OpAMD64VPERMI2DMasked128,
-               ssa.OpAMD64VPERMI2PSMasked256,
-               ssa.OpAMD64VPERMI2DMasked256,
-               ssa.OpAMD64VPERMI2PSMasked512,
-               ssa.OpAMD64VPERMI2DMasked512,
-               ssa.OpAMD64VPERMI2PDMasked128,
-               ssa.OpAMD64VPERMI2QMasked128,
-               ssa.OpAMD64VPERMI2PDMasked256,
-               ssa.OpAMD64VPERMI2QMasked256,
-               ssa.OpAMD64VPERMI2PDMasked512,
-               ssa.OpAMD64VPERMI2QMasked512,
                 ssa.OpAMD64VPSHUFBMasked256Merging,
                 ssa.OpAMD64VPSHUFBMasked512Merging,
                 ssa.OpAMD64VPSHUFBMasked128Merging,
@@ -1819,6 +1827,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 p = simdV21load(s, v)
  
         case ssa.OpAMD64VPDPWSSD512load,
+               ssa.OpAMD64VPERMI2PS128load,
+               ssa.OpAMD64VPERMI2D128load,
+               ssa.OpAMD64VPERMI2PS256load,
+               ssa.OpAMD64VPERMI2D256load,
+               ssa.OpAMD64VPERMI2PS512load,
+               ssa.OpAMD64VPERMI2D512load,
+               ssa.OpAMD64VPERMI2PD128load,
+               ssa.OpAMD64VPERMI2Q128load,
+               ssa.OpAMD64VPERMI2PD256load,
+               ssa.OpAMD64VPERMI2Q256load,
+               ssa.OpAMD64VPERMI2PD512load,
+               ssa.OpAMD64VPERMI2Q512load,
                 ssa.OpAMD64VPDPBUSD512load,
                 ssa.OpAMD64VPDPBUSDS512load,
                 ssa.OpAMD64VFMADD213PS128load,
@@ -1839,18 +1859,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VFMSUBADD213PD128load,
                 ssa.OpAMD64VFMSUBADD213PD256load,
                 ssa.OpAMD64VFMSUBADD213PD512load,
-               ssa.OpAMD64VPERMI2PS128load,
-               ssa.OpAMD64VPERMI2D128load,
-               ssa.OpAMD64VPERMI2PS256load,
-               ssa.OpAMD64VPERMI2D256load,
-               ssa.OpAMD64VPERMI2PS512load,
-               ssa.OpAMD64VPERMI2D512load,
-               ssa.OpAMD64VPERMI2PD128load,
-               ssa.OpAMD64VPERMI2Q128load,
-               ssa.OpAMD64VPERMI2PD256load,
-               ssa.OpAMD64VPERMI2Q256load,
-               ssa.OpAMD64VPERMI2PD512load,
-               ssa.OpAMD64VPERMI2Q512load,
                 ssa.OpAMD64VPSHLDVD128load,
                 ssa.OpAMD64VPSHLDVD256load,
                 ssa.OpAMD64VPSHLDVD512load,
@@ -1868,6 +1876,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
         case ssa.OpAMD64VPDPWSSDMasked128load,
                 ssa.OpAMD64VPDPWSSDMasked256load,
                 ssa.OpAMD64VPDPWSSDMasked512load,
+               ssa.OpAMD64VPERMI2PSMasked128load,
+               ssa.OpAMD64VPERMI2DMasked128load,
+               ssa.OpAMD64VPERMI2PSMasked256load,
+               ssa.OpAMD64VPERMI2DMasked256load,
+               ssa.OpAMD64VPERMI2PSMasked512load,
+               ssa.OpAMD64VPERMI2DMasked512load,
+               ssa.OpAMD64VPERMI2PDMasked128load,
+               ssa.OpAMD64VPERMI2QMasked128load,
+               ssa.OpAMD64VPERMI2PDMasked256load,
+               ssa.OpAMD64VPERMI2QMasked256load,
+               ssa.OpAMD64VPERMI2PDMasked512load,
+               ssa.OpAMD64VPERMI2QMasked512load,
                 ssa.OpAMD64VPDPBUSDMasked128load,
                 ssa.OpAMD64VPDPBUSDMasked256load,
                 ssa.OpAMD64VPDPBUSDMasked512load,
@@ -1892,18 +1912,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VFMSUBADD213PDMasked128load,
                 ssa.OpAMD64VFMSUBADD213PDMasked256load,
                 ssa.OpAMD64VFMSUBADD213PDMasked512load,
-               ssa.OpAMD64VPERMI2PSMasked128load,
-               ssa.OpAMD64VPERMI2DMasked128load,
-               ssa.OpAMD64VPERMI2PSMasked256load,
-               ssa.OpAMD64VPERMI2DMasked256load,
-               ssa.OpAMD64VPERMI2PSMasked512load,
-               ssa.OpAMD64VPERMI2DMasked512load,
-               ssa.OpAMD64VPERMI2PDMasked128load,
-               ssa.OpAMD64VPERMI2QMasked128load,
-               ssa.OpAMD64VPERMI2PDMasked256load,
-               ssa.OpAMD64VPERMI2QMasked256load,
-               ssa.OpAMD64VPERMI2PDMasked512load,
-               ssa.OpAMD64VPERMI2QMasked512load,
                 ssa.OpAMD64VPSHLDVDMasked128load,
                 ssa.OpAMD64VPSHLDVDMasked256load,
                 ssa.OpAMD64VPSHLDVDMasked512load,
@@ -2124,7 +2132,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VREDUCEPD128load,
                 ssa.OpAMD64VREDUCEPD256load,
                 ssa.OpAMD64VREDUCEPD512load,
-               ssa.OpAMD64VPSHUFD512load,
                 ssa.OpAMD64VPROLD128load,
                 ssa.OpAMD64VPROLD256load,
                 ssa.OpAMD64VPROLD512load,
@@ -2137,6 +2144,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPRORQ128load,
                 ssa.OpAMD64VPRORQ256load,
                 ssa.OpAMD64VPRORQ512load,
+               ssa.OpAMD64VPSHUFD512load,
                 ssa.OpAMD64VPSLLD512constload,
                 ssa.OpAMD64VPSLLQ512constload,
                 ssa.OpAMD64VPSRLD512constload,
@@ -2159,9 +2167,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VREDUCEPDMasked128load,
                 ssa.OpAMD64VREDUCEPDMasked256load,
                 ssa.OpAMD64VREDUCEPDMasked512load,
-               ssa.OpAMD64VPSHUFDMasked256load,
-               ssa.OpAMD64VPSHUFDMasked512load,
-               ssa.OpAMD64VPSHUFDMasked128load,
                 ssa.OpAMD64VPROLDMasked128load,
                 ssa.OpAMD64VPROLDMasked256load,
                 ssa.OpAMD64VPROLDMasked512load,
@@ -2174,6 +2179,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPRORQMasked128load,
                 ssa.OpAMD64VPRORQMasked256load,
                 ssa.OpAMD64VPRORQMasked512load,
+               ssa.OpAMD64VPSHUFDMasked256load,
+               ssa.OpAMD64VPSHUFDMasked512load,
+               ssa.OpAMD64VPSHUFDMasked128load,
                 ssa.OpAMD64VPSLLDMasked128constload,
                 ssa.OpAMD64VPSLLDMasked256constload,
                 ssa.OpAMD64VPSLLDMasked512constload,
@@ -2447,12 +2455,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPOPCNTQMasked128Merging,
                 ssa.OpAMD64VPOPCNTQMasked256Merging,
                 ssa.OpAMD64VPOPCNTQMasked512Merging,
-               ssa.OpAMD64VPSHUFDMasked256Merging,
-               ssa.OpAMD64VPSHUFDMasked512Merging,
-               ssa.OpAMD64VPSHUFHWMasked256Merging,
-               ssa.OpAMD64VPSHUFHWMasked512Merging,
-               ssa.OpAMD64VPSHUFHWMasked128Merging,
-               ssa.OpAMD64VPSHUFDMasked128Merging,
                 ssa.OpAMD64VRCP14PSMasked128Merging,
                 ssa.OpAMD64VRCP14PSMasked256Merging,
                 ssa.OpAMD64VRCP14PSMasked512Merging,
@@ -2483,6 +2485,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VSQRTPDMasked128Merging,
                 ssa.OpAMD64VSQRTPDMasked256Merging,
                 ssa.OpAMD64VSQRTPDMasked512Merging,
+               ssa.OpAMD64VPSHUFDMasked256Merging,
+               ssa.OpAMD64VPSHUFDMasked512Merging,
+               ssa.OpAMD64VPSHUFHWMasked256Merging,
+               ssa.OpAMD64VPSHUFHWMasked512Merging,
+               ssa.OpAMD64VPSHUFHWMasked128Merging,
+               ssa.OpAMD64VPSHUFLWMasked256Merging,
+               ssa.OpAMD64VPSHUFLWMasked512Merging,
+               ssa.OpAMD64VPSHUFLWMasked128Merging,
+               ssa.OpAMD64VPSHUFDMasked128Merging,
                 ssa.OpAMD64VPSLLWMasked128constMerging,
                 ssa.OpAMD64VPSLLWMasked256constMerging,
                 ssa.OpAMD64VPSLLWMasked512constMerging,
@@ -2674,6 +2685,36 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPCOMPRESSQMasked128,
                 ssa.OpAMD64VPCOMPRESSQMasked256,
                 ssa.OpAMD64VPCOMPRESSQMasked512,
+               ssa.OpAMD64VPERMI2BMasked128,
+               ssa.OpAMD64VPERMI2BMasked256,
+               ssa.OpAMD64VPERMI2BMasked512,
+               ssa.OpAMD64VPERMI2WMasked128,
+               ssa.OpAMD64VPERMI2WMasked256,
+               ssa.OpAMD64VPERMI2WMasked512,
+               ssa.OpAMD64VPERMI2PSMasked128,
+               ssa.OpAMD64VPERMI2PSMasked128load,
+               ssa.OpAMD64VPERMI2DMasked128,
+               ssa.OpAMD64VPERMI2DMasked128load,
+               ssa.OpAMD64VPERMI2PSMasked256,
+               ssa.OpAMD64VPERMI2PSMasked256load,
+               ssa.OpAMD64VPERMI2DMasked256,
+               ssa.OpAMD64VPERMI2DMasked256load,
+               ssa.OpAMD64VPERMI2PSMasked512,
+               ssa.OpAMD64VPERMI2PSMasked512load,
+               ssa.OpAMD64VPERMI2DMasked512,
+               ssa.OpAMD64VPERMI2DMasked512load,
+               ssa.OpAMD64VPERMI2PDMasked128,
+               ssa.OpAMD64VPERMI2PDMasked128load,
+               ssa.OpAMD64VPERMI2QMasked128,
+               ssa.OpAMD64VPERMI2QMasked128load,
+               ssa.OpAMD64VPERMI2PDMasked256,
+               ssa.OpAMD64VPERMI2PDMasked256load,
+               ssa.OpAMD64VPERMI2QMasked256,
+               ssa.OpAMD64VPERMI2QMasked256load,
+               ssa.OpAMD64VPERMI2PDMasked512,
+               ssa.OpAMD64VPERMI2PDMasked512load,
+               ssa.OpAMD64VPERMI2QMasked512,
+               ssa.OpAMD64VPERMI2QMasked512load,
                 ssa.OpAMD64VPALIGNRMasked256,
                 ssa.OpAMD64VPALIGNRMasked512,
                 ssa.OpAMD64VPALIGNRMasked128,
@@ -3061,48 +3102,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPORQMasked256load,
                 ssa.OpAMD64VPORQMasked512,
                 ssa.OpAMD64VPORQMasked512load,
-               ssa.OpAMD64VPERMI2BMasked128,
-               ssa.OpAMD64VPERMI2BMasked256,
-               ssa.OpAMD64VPERMI2BMasked512,
-               ssa.OpAMD64VPERMI2WMasked128,
-               ssa.OpAMD64VPERMI2WMasked256,
-               ssa.OpAMD64VPERMI2WMasked512,
-               ssa.OpAMD64VPERMI2PSMasked128,
-               ssa.OpAMD64VPERMI2PSMasked128load,
-               ssa.OpAMD64VPERMI2DMasked128,
-               ssa.OpAMD64VPERMI2DMasked128load,
-               ssa.OpAMD64VPERMI2PSMasked256,
-               ssa.OpAMD64VPERMI2PSMasked256load,
-               ssa.OpAMD64VPERMI2DMasked256,
-               ssa.OpAMD64VPERMI2DMasked256load,
-               ssa.OpAMD64VPERMI2PSMasked512,
-               ssa.OpAMD64VPERMI2PSMasked512load,
-               ssa.OpAMD64VPERMI2DMasked512,
-               ssa.OpAMD64VPERMI2DMasked512load,
-               ssa.OpAMD64VPERMI2PDMasked128,
-               ssa.OpAMD64VPERMI2PDMasked128load,
-               ssa.OpAMD64VPERMI2QMasked128,
-               ssa.OpAMD64VPERMI2QMasked128load,
-               ssa.OpAMD64VPERMI2PDMasked256,
-               ssa.OpAMD64VPERMI2PDMasked256load,
-               ssa.OpAMD64VPERMI2QMasked256,
-               ssa.OpAMD64VPERMI2QMasked256load,
-               ssa.OpAMD64VPERMI2PDMasked512,
-               ssa.OpAMD64VPERMI2PDMasked512load,
-               ssa.OpAMD64VPERMI2QMasked512,
-               ssa.OpAMD64VPERMI2QMasked512load,
-               ssa.OpAMD64VPSHUFDMasked256,
-               ssa.OpAMD64VPSHUFDMasked256load,
-               ssa.OpAMD64VPSHUFDMasked512,
-               ssa.OpAMD64VPSHUFDMasked512load,
-               ssa.OpAMD64VPSHUFHWMasked256,
-               ssa.OpAMD64VPSHUFHWMasked512,
-               ssa.OpAMD64VPSHUFHWMasked128,
-               ssa.OpAMD64VPSHUFDMasked128,
-               ssa.OpAMD64VPSHUFDMasked128load,
-               ssa.OpAMD64VPSHUFBMasked256,
-               ssa.OpAMD64VPSHUFBMasked512,
-               ssa.OpAMD64VPSHUFBMasked128,
+               ssa.OpAMD64VPERMBMasked128,
                 ssa.OpAMD64VPERMBMasked256,
                 ssa.OpAMD64VPERMBMasked512,
                 ssa.OpAMD64VPERMWMasked128,
@@ -3124,6 +3124,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPERMPDMasked512load,
                 ssa.OpAMD64VPERMQMasked512,
                 ssa.OpAMD64VPERMQMasked512load,
+               ssa.OpAMD64VPSHUFBMasked256,
+               ssa.OpAMD64VPSHUFBMasked512,
+               ssa.OpAMD64VPSHUFBMasked128,
                 ssa.OpAMD64VRCP14PSMasked128,
                 ssa.OpAMD64VRCP14PSMasked128load,
                 ssa.OpAMD64VRCP14PSMasked256,
@@ -3418,6 +3421,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VMOVDQU64Masked128,
                 ssa.OpAMD64VMOVDQU64Masked256,
                 ssa.OpAMD64VMOVDQU64Masked512,
+               ssa.OpAMD64VPSHUFDMasked256,
+               ssa.OpAMD64VPSHUFDMasked256load,
+               ssa.OpAMD64VPSHUFDMasked512,
+               ssa.OpAMD64VPSHUFDMasked512load,
+               ssa.OpAMD64VPSHUFHWMasked256,
+               ssa.OpAMD64VPSHUFHWMasked512,
+               ssa.OpAMD64VPSHUFHWMasked128,
+               ssa.OpAMD64VPSHUFLWMasked256,
+               ssa.OpAMD64VPSHUFLWMasked512,
+               ssa.OpAMD64VPSHUFLWMasked128,
+               ssa.OpAMD64VPSHUFDMasked128,
+               ssa.OpAMD64VPSHUFDMasked128load,
                 ssa.OpAMD64VPSLLWMasked128const,
                 ssa.OpAMD64VPSLLWMasked256const,
                 ssa.OpAMD64VPSLLWMasked512const,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules

index 5a9a1c0bc7453e41351ae1453027709b9609141b..283a2e53cd9ff10a2952e67a8072db7288427e07 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -216,6 +216,36 @@
  (CompressUint64x2 x mask) => (VPCOMPRESSQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
  (CompressUint64x4 x mask) => (VPCOMPRESSQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
  (CompressUint64x8 x mask) => (VPCOMPRESSQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(ConcatPermuteFloat32x4 ...) => (VPERMI2PS128 ...)
+(ConcatPermuteFloat32x8 ...) => (VPERMI2PS256 ...)
+(ConcatPermuteFloat32x16 ...) => (VPERMI2PS512 ...)
+(ConcatPermuteFloat64x2 ...) => (VPERMI2PD128 ...)
+(ConcatPermuteFloat64x4 ...) => (VPERMI2PD256 ...)
+(ConcatPermuteFloat64x8 ...) => (VPERMI2PD512 ...)
+(ConcatPermuteInt8x16 ...) => (VPERMI2B128 ...)
+(ConcatPermuteInt8x32 ...) => (VPERMI2B256 ...)
+(ConcatPermuteInt8x64 ...) => (VPERMI2B512 ...)
+(ConcatPermuteInt16x8 ...) => (VPERMI2W128 ...)
+(ConcatPermuteInt16x16 ...) => (VPERMI2W256 ...)
+(ConcatPermuteInt16x32 ...) => (VPERMI2W512 ...)
+(ConcatPermuteInt32x4 ...) => (VPERMI2D128 ...)
+(ConcatPermuteInt32x8 ...) => (VPERMI2D256 ...)
+(ConcatPermuteInt32x16 ...) => (VPERMI2D512 ...)
+(ConcatPermuteInt64x2 ...) => (VPERMI2Q128 ...)
+(ConcatPermuteInt64x4 ...) => (VPERMI2Q256 ...)
+(ConcatPermuteInt64x8 ...) => (VPERMI2Q512 ...)
+(ConcatPermuteUint8x16 ...) => (VPERMI2B128 ...)
+(ConcatPermuteUint8x32 ...) => (VPERMI2B256 ...)
+(ConcatPermuteUint8x64 ...) => (VPERMI2B512 ...)
+(ConcatPermuteUint16x8 ...) => (VPERMI2W128 ...)
+(ConcatPermuteUint16x16 ...) => (VPERMI2W256 ...)
+(ConcatPermuteUint16x32 ...) => (VPERMI2W512 ...)
+(ConcatPermuteUint32x4 ...) => (VPERMI2D128 ...)
+(ConcatPermuteUint32x8 ...) => (VPERMI2D256 ...)
+(ConcatPermuteUint32x16 ...) => (VPERMI2D512 ...)
+(ConcatPermuteUint64x2 ...) => (VPERMI2Q128 ...)
+(ConcatPermuteUint64x4 ...) => (VPERMI2Q256 ...)
+(ConcatPermuteUint64x8 ...) => (VPERMI2Q512 ...)
  (ConcatShiftBytesRightUint8x16 ...) => (VPALIGNR128 ...)
  (ConcatShiftBytesRightGroupedUint8x32 ...) => (VPALIGNR256 ...)
  (ConcatShiftBytesRightGroupedUint8x64 ...) => (VPALIGNR512 ...)
@@ -794,7 +824,7 @@
  (PermuteFloat32x16 ...) => (VPERMPS512 ...)
  (PermuteFloat64x4 ...) => (VPERMPD256 ...)
  (PermuteFloat64x8 ...) => (VPERMPD512 ...)
-(PermuteInt8x16 ...) => (VPSHUFB128 ...)
+(PermuteInt8x16 ...) => (VPERMB128 ...)
  (PermuteInt8x32 ...) => (VPERMB256 ...)
  (PermuteInt8x64 ...) => (VPERMB512 ...)
  (PermuteInt16x8 ...) => (VPERMW128 ...)
@@ -804,7 +834,7 @@
  (PermuteInt32x16 ...) => (VPERMD512 ...)
  (PermuteInt64x4 ...) => (VPERMQ256 ...)
  (PermuteInt64x8 ...) => (VPERMQ512 ...)
-(PermuteUint8x16 ...) => (VPSHUFB128 ...)
+(PermuteUint8x16 ...) => (VPERMB128 ...)
  (PermuteUint8x32 ...) => (VPERMB256 ...)
  (PermuteUint8x64 ...) => (VPERMB512 ...)
  (PermuteUint16x8 ...) => (VPERMW128 ...)
@@ -814,62 +844,12 @@
  (PermuteUint32x16 ...) => (VPERMD512 ...)
  (PermuteUint64x4 ...) => (VPERMQ256 ...)
  (PermuteUint64x8 ...) => (VPERMQ512 ...)
-(Permute2Float32x4 ...) => (VPERMI2PS128 ...)
-(Permute2Float32x8 ...) => (VPERMI2PS256 ...)
-(Permute2Float32x16 ...) => (VPERMI2PS512 ...)
-(Permute2Float64x2 ...) => (VPERMI2PD128 ...)
-(Permute2Float64x4 ...) => (VPERMI2PD256 ...)
-(Permute2Float64x8 ...) => (VPERMI2PD512 ...)
-(Permute2Int8x16 ...) => (VPERMI2B128 ...)
-(Permute2Int8x32 ...) => (VPERMI2B256 ...)
-(Permute2Int8x64 ...) => (VPERMI2B512 ...)
-(Permute2Int16x8 ...) => (VPERMI2W128 ...)
-(Permute2Int16x16 ...) => (VPERMI2W256 ...)
-(Permute2Int16x32 ...) => (VPERMI2W512 ...)
-(Permute2Int32x4 ...) => (VPERMI2D128 ...)
-(Permute2Int32x8 ...) => (VPERMI2D256 ...)
-(Permute2Int32x16 ...) => (VPERMI2D512 ...)
-(Permute2Int64x2 ...) => (VPERMI2Q128 ...)
-(Permute2Int64x4 ...) => (VPERMI2Q256 ...)
-(Permute2Int64x8 ...) => (VPERMI2Q512 ...)
-(Permute2Uint8x16 ...) => (VPERMI2B128 ...)
-(Permute2Uint8x32 ...) => (VPERMI2B256 ...)
-(Permute2Uint8x64 ...) => (VPERMI2B512 ...)
-(Permute2Uint16x8 ...) => (VPERMI2W128 ...)
-(Permute2Uint16x16 ...) => (VPERMI2W256 ...)
-(Permute2Uint16x32 ...) => (VPERMI2W512 ...)
-(Permute2Uint32x4 ...) => (VPERMI2D128 ...)
-(Permute2Uint32x8 ...) => (VPERMI2D256 ...)
-(Permute2Uint32x16 ...) => (VPERMI2D512 ...)
-(Permute2Uint64x2 ...) => (VPERMI2Q128 ...)
-(Permute2Uint64x4 ...) => (VPERMI2Q256 ...)
-(Permute2Uint64x8 ...) => (VPERMI2Q512 ...)
-(PermuteConstantInt32x4 ...) => (VPSHUFD128 ...)
-(PermuteConstantUint32x4 ...) => (VPSHUFD128 ...)
-(PermuteConstantGroupedInt32x8 ...) => (VPSHUFD256 ...)
-(PermuteConstantGroupedInt32x16 ...) => (VPSHUFD512 ...)
-(PermuteConstantGroupedUint32x8 ...) => (VPSHUFD256 ...)
-(PermuteConstantGroupedUint32x16 ...) => (VPSHUFD512 ...)
-(PermuteConstantHiInt16x8 ...) => (VPSHUFHW128 ...)
-(PermuteConstantHiInt32x4 ...) => (VPSHUFHW128 ...)
-(PermuteConstantHiUint16x8 ...) => (VPSHUFHW128 ...)
-(PermuteConstantHiUint32x4 ...) => (VPSHUFHW128 ...)
-(PermuteConstantHiGroupedInt16x16 ...) => (VPSHUFHW256 ...)
-(PermuteConstantHiGroupedInt16x32 ...) => (VPSHUFHW512 ...)
-(PermuteConstantHiGroupedUint16x16 ...) => (VPSHUFHW256 ...)
-(PermuteConstantHiGroupedUint16x32 ...) => (VPSHUFHW512 ...)
-(PermuteConstantLoInt16x8 ...) => (VPSHUFHW128 ...)
-(PermuteConstantLoInt32x4 ...) => (VPSHUFHW128 ...)
-(PermuteConstantLoUint16x8 ...) => (VPSHUFHW128 ...)
-(PermuteConstantLoUint32x4 ...) => (VPSHUFHW128 ...)
-(PermuteConstantLoGroupedInt16x16 ...) => (VPSHUFHW256 ...)
-(PermuteConstantLoGroupedInt16x32 ...) => (VPSHUFHW512 ...)
-(PermuteConstantLoGroupedUint16x16 ...) => (VPSHUFHW256 ...)
-(PermuteConstantLoGroupedUint16x32 ...) => (VPSHUFHW512 ...)
-(PermuteGroupedInt8x32 ...) => (VPSHUFB256 ...)
-(PermuteGroupedInt8x64 ...) => (VPSHUFB512 ...)
-(PermuteGroupedUint8x32 ...) => (VPSHUFB256 ...)
-(PermuteGroupedUint8x64 ...) => (VPSHUFB512 ...)
+(PermuteOrZeroInt8x16 ...) => (VPSHUFB128 ...)
+(PermuteOrZeroUint8x16 ...) => (VPSHUFB128 ...)
+(PermuteOrZeroGroupedInt8x32 ...) => (VPSHUFB256 ...)
+(PermuteOrZeroGroupedInt8x64 ...) => (VPSHUFB512 ...)
+(PermuteOrZeroGroupedUint8x32 ...) => (VPSHUFB256 ...)
+(PermuteOrZeroGroupedUint8x64 ...) => (VPSHUFB512 ...)
  (ReciprocalFloat32x4 ...) => (VRCPPS128 ...)
  (ReciprocalFloat32x8 ...) => (VRCPPS256 ...)
  (ReciprocalFloat32x16 ...) => (VRCP14PS512 ...)
@@ -1324,6 +1304,24 @@
  (concatSelectedConstantGroupedUint32x16 ...) => (VSHUFPS512 ...)
  (concatSelectedConstantGroupedUint64x4 ...) => (VSHUFPD256 ...)
  (concatSelectedConstantGroupedUint64x8 ...) => (VSHUFPD512 ...)
+(permuteScalarsInt32x4 ...) => (VPSHUFD128 ...)
+(permuteScalarsUint32x4 ...) => (VPSHUFD128 ...)
+(permuteScalarsGroupedInt32x8 ...) => (VPSHUFD256 ...)
+(permuteScalarsGroupedInt32x16 ...) => (VPSHUFD512 ...)
+(permuteScalarsGroupedUint32x8 ...) => (VPSHUFD256 ...)
+(permuteScalarsGroupedUint32x16 ...) => (VPSHUFD512 ...)
+(permuteScalarsHiInt16x8 ...) => (VPSHUFHW128 ...)
+(permuteScalarsHiUint16x8 ...) => (VPSHUFHW128 ...)
+(permuteScalarsHiGroupedInt16x16 ...) => (VPSHUFHW256 ...)
+(permuteScalarsHiGroupedInt16x32 ...) => (VPSHUFHW512 ...)
+(permuteScalarsHiGroupedUint16x16 ...) => (VPSHUFHW256 ...)
+(permuteScalarsHiGroupedUint16x32 ...) => (VPSHUFHW512 ...)
+(permuteScalarsLoInt16x8 ...) => (VPSHUFLW128 ...)
+(permuteScalarsLoUint16x8 ...) => (VPSHUFLW128 ...)
+(permuteScalarsLoGroupedInt16x16 ...) => (VPSHUFLW256 ...)
+(permuteScalarsLoGroupedInt16x32 ...) => (VPSHUFLW512 ...)
+(permuteScalarsLoGroupedUint16x16 ...) => (VPSHUFLW256 ...)
+(permuteScalarsLoGroupedUint16x32 ...) => (VPSHUFLW512 ...)
  (ternInt32x4 ...) => (VPTERNLOGD128 ...)
  (ternInt32x8 ...) => (VPTERNLOGD256 ...)
  (ternInt32x16 ...) => (VPTERNLOGD512 ...)
@@ -1417,6 +1415,24 @@
  (VMOVDQU64Masked128 (VREDUCEPD128 [a] x) mask) => (VREDUCEPDMasked128 [a] x mask)
  (VMOVDQU64Masked256 (VREDUCEPD256 [a] x) mask) => (VREDUCEPDMasked256 [a] x mask)
  (VMOVDQU64Masked512 (VREDUCEPD512 [a] x) mask) => (VREDUCEPDMasked512 [a] x mask)
+(VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) => (VPERMI2BMasked128 x y z mask)
+(VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) => (VPERMI2BMasked256 x y z mask)
+(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask)
+(VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) => (VPERMI2WMasked128 x y z mask)
+(VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) => (VPERMI2WMasked256 x y z mask)
+(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask)
+(VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) => (VPERMI2PSMasked128 x y z mask)
+(VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) => (VPERMI2DMasked128 x y z mask)
+(VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) => (VPERMI2PSMasked256 x y z mask)
+(VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) => (VPERMI2DMasked256 x y z mask)
+(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask)
+(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask)
+(VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) => (VPERMI2PDMasked128 x y z mask)
+(VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) => (VPERMI2QMasked128 x y z mask)
+(VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) => (VPERMI2PDMasked256 x y z mask)
+(VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) => (VPERMI2QMasked256 x y z mask)
+(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask)
+(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask)
  (VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask) => (VPALIGNRMasked256 [a] x y mask)
  (VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask) => (VPALIGNRMasked512 [a] x y mask)
  (VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask) => (VPALIGNRMasked128 [a] x y mask)
@@ -1668,33 +1684,7 @@
  (VMOVDQU64Masked512 (VPOPCNTQ512 x) mask) => (VPOPCNTQMasked512 x mask)
  (VMOVDQU32Masked512 (VPORD512 x y) mask) => (VPORDMasked512 x y mask)
  (VMOVDQU64Masked512 (VPORQ512 x y) mask) => (VPORQMasked512 x y mask)
-(VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) => (VPERMI2BMasked128 x y z mask)
-(VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) => (VPERMI2BMasked256 x y z mask)
-(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask)
-(VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) => (VPERMI2WMasked128 x y z mask)
-(VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) => (VPERMI2WMasked256 x y z mask)
-(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask)
-(VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) => (VPERMI2PSMasked128 x y z mask)
-(VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) => (VPERMI2DMasked128 x y z mask)
-(VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) => (VPERMI2PSMasked256 x y z mask)
-(VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) => (VPERMI2DMasked256 x y z mask)
-(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask)
-(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask)
-(VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) => (VPERMI2PDMasked128 x y z mask)
-(VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) => (VPERMI2QMasked128 x y z mask)
-(VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) => (VPERMI2PDMasked256 x y z mask)
-(VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) => (VPERMI2QMasked256 x y z mask)
-(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask)
-(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask)
-(VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) => (VPSHUFDMasked256 [a] x mask)
-(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask)
-(VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) => (VPSHUFHWMasked256 [a] x mask)
-(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask)
-(VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) => (VPSHUFHWMasked128 [a] x mask)
-(VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) => (VPSHUFDMasked128 [a] x mask)
-(VMOVDQU8Masked256 (VPSHUFB256 x y) mask) => (VPSHUFBMasked256 x y mask)
-(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask)
-(VMOVDQU8Masked128 (VPSHUFB128 x y) mask) => (VPSHUFBMasked128 x y mask)
+(VMOVDQU8Masked128 (VPERMB128 x y) mask) => (VPERMBMasked128 x y mask)
  (VMOVDQU8Masked256 (VPERMB256 x y) mask) => (VPERMBMasked256 x y mask)
  (VMOVDQU8Masked512 (VPERMB512 x y) mask) => (VPERMBMasked512 x y mask)
  (VMOVDQU16Masked128 (VPERMW128 x y) mask) => (VPERMWMasked128 x y mask)
@@ -1708,6 +1698,9 @@
  (VMOVDQU64Masked256 (VPERMQ256 x y) mask) => (VPERMQMasked256 x y mask)
  (VMOVDQU64Masked512 (VPERMPD512 x y) mask) => (VPERMPDMasked512 x y mask)
  (VMOVDQU64Masked512 (VPERMQ512 x y) mask) => (VPERMQMasked512 x y mask)
+(VMOVDQU8Masked256 (VPSHUFB256 x y) mask) => (VPSHUFBMasked256 x y mask)
+(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask)
+(VMOVDQU8Masked128 (VPSHUFB128 x y) mask) => (VPSHUFBMasked128 x y mask)
  (VMOVDQU32Masked512 (VRCP14PS512 x) mask) => (VRCP14PSMasked512 x mask)
  (VMOVDQU64Masked128 (VRCP14PD128 x) mask) => (VRCP14PDMasked128 x mask)
  (VMOVDQU64Masked256 (VRCP14PD256 x) mask) => (VRCP14PDMasked256 x mask)
@@ -1874,6 +1867,15 @@
  (VMOVDQU16Masked512 (VPSUBUSW512 x y) mask) => (VPSUBUSWMasked512 x y mask)
  (VMOVDQU32Masked512 (VPXORD512 x y) mask) => (VPXORDMasked512 x y mask)
  (VMOVDQU64Masked512 (VPXORQ512 x y) mask) => (VPXORQMasked512 x y mask)
+(VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) => (VPSHUFDMasked256 [a] x mask)
+(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask)
+(VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) => (VPSHUFHWMasked256 [a] x mask)
+(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask)
+(VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) => (VPSHUFHWMasked128 [a] x mask)
+(VMOVDQU16Masked256 (VPSHUFLW256 [a] x) mask) => (VPSHUFLWMasked256 [a] x mask)
+(VMOVDQU16Masked512 (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512 [a] x mask)
+(VMOVDQU16Masked128 (VPSHUFLW128 [a] x) mask) => (VPSHUFLWMasked128 [a] x mask)
+(VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) => (VPSHUFDMasked128 [a] x mask)
  (VMOVDQU16Masked128 (VPSLLW128const [a] x) mask) => (VPSLLWMasked128const [a] x mask)
  (VMOVDQU16Masked256 (VPSLLW256const [a] x) mask) => (VPSLLWMasked256const [a] x mask)
  (VMOVDQU16Masked512 (VPSLLW512const [a] x) mask) => (VPSLLWMasked512const [a] x mask)
@@ -2021,6 +2023,7 @@
  (VPBLENDMWMasked512 dst (VPSHLDW512 [a] x y) mask) => (VPSHLDWMasked512Merging dst [a] x y mask)
  (VPBLENDMWMasked512 dst (VPSHRDW512 [a] x y) mask) => (VPSHRDWMasked512Merging dst [a] x y mask)
  (VPBLENDMWMasked512 dst (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512Merging dst [a] x mask)
+(VPBLENDMWMasked512 dst (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512Merging dst [a] x mask)
  (VPBLENDMWMasked512 dst (VPSLLVW512 x y) mask) => (VPSLLVWMasked512Merging dst x y mask)
  (VPBLENDMWMasked512 dst (VPSLLW512const [a] x) mask) => (VPSLLWMasked512constMerging dst [a] x mask)
  (VPBLENDMWMasked512 dst (VPSRAVW512 x y) mask) => (VPSRAVWMasked512Merging dst x y mask)
@@ -2170,6 +2173,7 @@
  (VPBLENDVB128 dst (VPSHUFB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked128Merging dst x y (VPMOVVec8x16ToM <types.TypeMask> mask))
  (VPBLENDVB128 dst (VPSHUFD128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked128Merging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
  (VPBLENDVB128 dst (VPSHUFHW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask))
+(VPBLENDVB128 dst (VPSHUFLW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFLWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask))
  (VPBLENDVB128 dst (VPSLLD128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked128constMerging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
  (VPBLENDVB128 dst (VPSLLQ128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked128constMerging dst [a] x (VPMOVVec64x2ToM <types.TypeMask> mask))
  (VPBLENDVB128 dst (VPSLLVD128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked128Merging dst x y (VPMOVVec32x4ToM <types.TypeMask> mask))
@@ -2305,6 +2309,7 @@
  (VPBLENDVB256 dst (VPSHUFB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked256Merging dst x y (VPMOVVec8x32ToM <types.TypeMask> mask))
  (VPBLENDVB256 dst (VPSHUFD256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked256Merging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
  (VPBLENDVB256 dst (VPSHUFHW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask))
+(VPBLENDVB256 dst (VPSHUFLW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFLWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask))
  (VPBLENDVB256 dst (VPSLLD256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked256constMerging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
  (VPBLENDVB256 dst (VPSLLQ256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked256constMerging dst [a] x (VPMOVVec64x4ToM <types.TypeMask> mask))
  (VPBLENDVB256 dst (VPSLLVD256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked256Merging dst x y (VPMOVVec32x8ToM <types.TypeMask> mask))
@@ -2410,6 +2415,30 @@
  (VREDUCEPDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
  (VREDUCEPDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
  (VREDUCEPDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem)
+(VPERMI2D128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem)
+(VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem)
+(VPERMI2D256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D256load {sym} [off] x y ptr mem)
+(VPERMI2PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS512load {sym} [off] x y ptr mem)
+(VPERMI2D512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D512load {sym} [off] x y ptr mem)
+(VPERMI2PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD128load {sym} [off] x y ptr mem)
+(VPERMI2Q128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q128load {sym} [off] x y ptr mem)
+(VPERMI2PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD256load {sym} [off] x y ptr mem)
+(VPERMI2Q256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q256load {sym} [off] x y ptr mem)
+(VPERMI2PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD512load {sym} [off] x y ptr mem)
+(VPERMI2Q512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q512load {sym} [off] x y ptr mem)
+(VPERMI2PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked128load {sym} [off] x y ptr mask mem)
+(VPERMI2DMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked128load {sym} [off] x y ptr mask mem)
+(VPERMI2PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked256load {sym} [off] x y ptr mask mem)
+(VPERMI2DMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked256load {sym} [off] x y ptr mask mem)
+(VPERMI2PSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked512load {sym} [off] x y ptr mask mem)
+(VPERMI2DMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked512load {sym} [off] x y ptr mask mem)
+(VPERMI2PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked128load {sym} [off] x y ptr mask mem)
+(VPERMI2QMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked128load {sym} [off] x y ptr mask mem)
+(VPERMI2PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked256load {sym} [off] x y ptr mask mem)
+(VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem)
+(VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem)
+(VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem)
  (VPACKSSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDW512load {sym} [off] x ptr mem)
  (VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem)
  (VPACKSSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked256load {sym} [off] x ptr mask mem)
@@ -2636,34 +2665,6 @@
  (VPERMQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ256load {sym} [off] x ptr mem)
  (VPERMPD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMPD512load {sym} [off] x ptr mem)
  (VPERMQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ512load {sym} [off] x ptr mem)
-(VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem)
-(VPERMI2D128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem)
-(VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem)
-(VPERMI2D256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D256load {sym} [off] x y ptr mem)
-(VPERMI2PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS512load {sym} [off] x y ptr mem)
-(VPERMI2D512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D512load {sym} [off] x y ptr mem)
-(VPERMI2PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD128load {sym} [off] x y ptr mem)
-(VPERMI2Q128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q128load {sym} [off] x y ptr mem)
-(VPERMI2PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD256load {sym} [off] x y ptr mem)
-(VPERMI2Q256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q256load {sym} [off] x y ptr mem)
-(VPERMI2PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD512load {sym} [off] x y ptr mem)
-(VPERMI2Q512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q512load {sym} [off] x y ptr mem)
-(VPERMI2PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked128load {sym} [off] x y ptr mask mem)
-(VPERMI2DMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked128load {sym} [off] x y ptr mask mem)
-(VPERMI2PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked256load {sym} [off] x y ptr mask mem)
-(VPERMI2DMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked256load {sym} [off] x y ptr mask mem)
-(VPERMI2PSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked512load {sym} [off] x y ptr mask mem)
-(VPERMI2DMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked512load {sym} [off] x y ptr mask mem)
-(VPERMI2PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked128load {sym} [off] x y ptr mask mem)
-(VPERMI2QMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked128load {sym} [off] x y ptr mask mem)
-(VPERMI2PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked256load {sym} [off] x y ptr mask mem)
-(VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem)
-(VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem)
-(VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem)
-(VPSHUFD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
-(VPSHUFDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSHUFDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
-(VPSHUFDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
  (VPERMPSMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked256load {sym} [off] x ptr mask mem)
  (VPERMDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMDMasked256load {sym} [off] x ptr mask mem)
  (VPERMPSMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked512load {sym} [off] x ptr mask mem)
@@ -2862,6 +2863,10 @@
  (VPBLENDMQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMQMasked512load {sym} [off] x ptr mask mem)
  (VSHUFPS512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
  (VSHUFPD512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VPSHUFD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPSHUFDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSHUFDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSHUFDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
  (VPSLLD512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
  (VPSLLQ512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
  (VPSLLDMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go

index 674cfb19d6ae077fe526294affb5cdff11cf73d7..404354d38701afc760f113751bb486ed353cf571 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@@ -383,8 +383,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPDPWSSDMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true},
                 {name: "VPDPWSSDMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true},
                 {name: "VPDPWSSDMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true},
+               {name: "VPERMB128", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPERMBMasked128", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPERMBMasked512", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
@@ -1310,6 +1312,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPSHUFHWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPSHUFHWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPSHUFHWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPSHUFLW128", argLength: 1, reg: w11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPSHUFLW256", argLength: 1, reg: v11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPSHUFLW512", argLength: 1, reg: w11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPSHUFLWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPSHUFLWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPSHUFLWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VPSLLD128const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPSLLD256const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPSLLD512const", argLength: 1, reg: w11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
@@ -2392,6 +2400,9 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPSHUFHWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
                 {name: "VPSHUFHWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
                 {name: "VPSHUFHWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
+               {name: "VPSHUFLWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
+               {name: "VPSHUFLWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
+               {name: "VPSHUFLWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
                 {name: "VPSLLDMasked128constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
                 {name: "VPSLLDMasked256constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
                 {name: "VPSLLDMasked512constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go

index 6a79fa3856343beaabb487608301ba3351f0e492..3fae158c0ae911bb4bf657a89bb0cea1dc786555 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -207,6 +207,36 @@ func simdGenericOps() []opData {
                 {name: "CompressUint64x2", argLength: 2, commutative: false},
                 {name: "CompressUint64x4", argLength: 2, commutative: false},
                 {name: "CompressUint64x8", argLength: 2, commutative: false},
+               {name: "ConcatPermuteFloat32x4", argLength: 3, commutative: false},
+               {name: "ConcatPermuteFloat32x8", argLength: 3, commutative: false},
+               {name: "ConcatPermuteFloat32x16", argLength: 3, commutative: false},
+               {name: "ConcatPermuteFloat64x2", argLength: 3, commutative: false},
+               {name: "ConcatPermuteFloat64x4", argLength: 3, commutative: false},
+               {name: "ConcatPermuteFloat64x8", argLength: 3, commutative: false},
+               {name: "ConcatPermuteInt8x16", argLength: 3, commutative: false},
+               {name: "ConcatPermuteInt8x32", argLength: 3, commutative: false},
+               {name: "ConcatPermuteInt8x64", argLength: 3, commutative: false},
+               {name: "ConcatPermuteInt16x8", argLength: 3, commutative: false},
+               {name: "ConcatPermuteInt16x16", argLength: 3, commutative: false},
+               {name: "ConcatPermuteInt16x32", argLength: 3, commutative: false},
+               {name: "ConcatPermuteInt32x4", argLength: 3, commutative: false},
+               {name: "ConcatPermuteInt32x8", argLength: 3, commutative: false},
+               {name: "ConcatPermuteInt32x16", argLength: 3, commutative: false},
+               {name: "ConcatPermuteInt64x2", argLength: 3, commutative: false},
+               {name: "ConcatPermuteInt64x4", argLength: 3, commutative: false},
+               {name: "ConcatPermuteInt64x8", argLength: 3, commutative: false},
+               {name: "ConcatPermuteUint8x16", argLength: 3, commutative: false},
+               {name: "ConcatPermuteUint8x32", argLength: 3, commutative: false},
+               {name: "ConcatPermuteUint8x64", argLength: 3, commutative: false},
+               {name: "ConcatPermuteUint16x8", argLength: 3, commutative: false},
+               {name: "ConcatPermuteUint16x16", argLength: 3, commutative: false},
+               {name: "ConcatPermuteUint16x32", argLength: 3, commutative: false},
+               {name: "ConcatPermuteUint32x4", argLength: 3, commutative: false},
+               {name: "ConcatPermuteUint32x8", argLength: 3, commutative: false},
+               {name: "ConcatPermuteUint32x16", argLength: 3, commutative: false},
+               {name: "ConcatPermuteUint64x2", argLength: 3, commutative: false},
+               {name: "ConcatPermuteUint64x4", argLength: 3, commutative: false},
+               {name: "ConcatPermuteUint64x8", argLength: 3, commutative: false},
                 {name: "ConvertToInt8Int16x8", argLength: 1, commutative: false},
                 {name: "ConvertToInt8Int16x16", argLength: 1, commutative: false},
                 {name: "ConvertToInt8Int16x32", argLength: 1, commutative: false},
@@ -750,44 +780,10 @@ func simdGenericOps() []opData {
                 {name: "OrUint64x2", argLength: 2, commutative: true},
                 {name: "OrUint64x4", argLength: 2, commutative: true},
                 {name: "OrUint64x8", argLength: 2, commutative: true},
-               {name: "Permute2Float32x4", argLength: 3, commutative: false},
-               {name: "Permute2Float32x8", argLength: 3, commutative: false},
-               {name: "Permute2Float32x16", argLength: 3, commutative: false},
-               {name: "Permute2Float64x2", argLength: 3, commutative: false},
-               {name: "Permute2Float64x4", argLength: 3, commutative: false},
-               {name: "Permute2Float64x8", argLength: 3, commutative: false},
-               {name: "Permute2Int8x16", argLength: 3, commutative: false},
-               {name: "Permute2Int8x32", argLength: 3, commutative: false},
-               {name: "Permute2Int8x64", argLength: 3, commutative: false},
-               {name: "Permute2Int16x8", argLength: 3, commutative: false},
-               {name: "Permute2Int16x16", argLength: 3, commutative: false},
-               {name: "Permute2Int16x32", argLength: 3, commutative: false},
-               {name: "Permute2Int32x4", argLength: 3, commutative: false},
-               {name: "Permute2Int32x8", argLength: 3, commutative: false},
-               {name: "Permute2Int32x16", argLength: 3, commutative: false},
-               {name: "Permute2Int64x2", argLength: 3, commutative: false},
-               {name: "Permute2Int64x4", argLength: 3, commutative: false},
-               {name: "Permute2Int64x8", argLength: 3, commutative: false},
-               {name: "Permute2Uint8x16", argLength: 3, commutative: false},
-               {name: "Permute2Uint8x32", argLength: 3, commutative: false},
-               {name: "Permute2Uint8x64", argLength: 3, commutative: false},
-               {name: "Permute2Uint16x8", argLength: 3, commutative: false},
-               {name: "Permute2Uint16x16", argLength: 3, commutative: false},
-               {name: "Permute2Uint16x32", argLength: 3, commutative: false},
-               {name: "Permute2Uint32x4", argLength: 3, commutative: false},
-               {name: "Permute2Uint32x8", argLength: 3, commutative: false},
-               {name: "Permute2Uint32x16", argLength: 3, commutative: false},
-               {name: "Permute2Uint64x2", argLength: 3, commutative: false},
-               {name: "Permute2Uint64x4", argLength: 3, commutative: false},
-               {name: "Permute2Uint64x8", argLength: 3, commutative: false},
                 {name: "PermuteFloat32x8", argLength: 2, commutative: false},
                 {name: "PermuteFloat32x16", argLength: 2, commutative: false},
                 {name: "PermuteFloat64x4", argLength: 2, commutative: false},
                 {name: "PermuteFloat64x8", argLength: 2, commutative: false},
-               {name: "PermuteGroupedInt8x32", argLength: 2, commutative: false},
-               {name: "PermuteGroupedInt8x64", argLength: 2, commutative: false},
-               {name: "PermuteGroupedUint8x32", argLength: 2, commutative: false},
-               {name: "PermuteGroupedUint8x64", argLength: 2, commutative: false},
                 {name: "PermuteInt8x16", argLength: 2, commutative: false},
                 {name: "PermuteInt8x32", argLength: 2, commutative: false},
                 {name: "PermuteInt8x64", argLength: 2, commutative: false},
@@ -798,6 +794,12 @@ func simdGenericOps() []opData {
                 {name: "PermuteInt32x16", argLength: 2, commutative: false},
                 {name: "PermuteInt64x4", argLength: 2, commutative: false},
                 {name: "PermuteInt64x8", argLength: 2, commutative: false},
+               {name: "PermuteOrZeroGroupedInt8x32", argLength: 2, commutative: false},
+               {name: "PermuteOrZeroGroupedInt8x64", argLength: 2, commutative: false},
+               {name: "PermuteOrZeroGroupedUint8x32", argLength: 2, commutative: false},
+               {name: "PermuteOrZeroGroupedUint8x64", argLength: 2, commutative: false},
+               {name: "PermuteOrZeroInt8x16", argLength: 2, commutative: false},
+               {name: "PermuteOrZeroUint8x16", argLength: 2, commutative: false},
                 {name: "PermuteUint8x16", argLength: 2, commutative: false},
                 {name: "PermuteUint8x32", argLength: 2, commutative: false},
                 {name: "PermuteUint8x64", argLength: 2, commutative: false},
@@ -1151,28 +1153,6 @@ func simdGenericOps() []opData {
                 {name: "GetElemUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
                 {name: "GetElemUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
                 {name: "GetElemUint64x2", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantHiInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantHiUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantLoInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantLoUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
-               {name: "PermuteConstantUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
                 {name: "RotateAllLeftInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
                 {name: "RotateAllLeftInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
                 {name: "RotateAllLeftInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
@@ -1292,6 +1272,24 @@ func simdGenericOps() []opData {
                 {name: "concatSelectedConstantInt64x2", argLength: 2, commutative: false, aux: "UInt8"},
                 {name: "concatSelectedConstantUint32x4", argLength: 2, commutative: false, aux: "UInt8"},
                 {name: "concatSelectedConstantUint64x2", argLength: 2, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "permuteScalarsUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
                 {name: "ternInt32x4", argLength: 3, commutative: false, aux: "UInt8"},
                 {name: "ternInt32x8", argLength: 3, commutative: false, aux: "UInt8"},
                 {name: "ternInt32x16", argLength: 3, commutative: false, aux: "UInt8"},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go

index ea5491362f83a5566cb56c4af530c5af4eda6ea1..fa94dfbbd59b8eb9fba7695174551b20bfede610 100644 (file)
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1624,8 +1624,10 @@ const (
         OpAMD64VPDPWSSDMasked128
         OpAMD64VPDPWSSDMasked256
         OpAMD64VPDPWSSDMasked512
+       OpAMD64VPERMB128
         OpAMD64VPERMB256
         OpAMD64VPERMB512
+       OpAMD64VPERMBMasked128
         OpAMD64VPERMBMasked256
         OpAMD64VPERMBMasked512
         OpAMD64VPERMD256
@@ -2551,6 +2553,12 @@ const (
         OpAMD64VPSHUFHWMasked128
         OpAMD64VPSHUFHWMasked256
         OpAMD64VPSHUFHWMasked512
+       OpAMD64VPSHUFLW128
+       OpAMD64VPSHUFLW256
+       OpAMD64VPSHUFLW512
+       OpAMD64VPSHUFLWMasked128
+       OpAMD64VPSHUFLWMasked256
+       OpAMD64VPSHUFLWMasked512
         OpAMD64VPSLLD128const
         OpAMD64VPSLLD256const
         OpAMD64VPSLLD512const
@@ -3633,6 +3641,9 @@ const (
         OpAMD64VPSHUFHWMasked128Merging
         OpAMD64VPSHUFHWMasked256Merging
         OpAMD64VPSHUFHWMasked512Merging
+       OpAMD64VPSHUFLWMasked128Merging
+       OpAMD64VPSHUFLWMasked256Merging
+       OpAMD64VPSHUFLWMasked512Merging
         OpAMD64VPSLLDMasked128constMerging
         OpAMD64VPSLLDMasked256constMerging
         OpAMD64VPSLLDMasked512constMerging
@@ -6155,6 +6166,36 @@ const (
         OpCompressUint64x2
         OpCompressUint64x4
         OpCompressUint64x8
+       OpConcatPermuteFloat32x4
+       OpConcatPermuteFloat32x8
+       OpConcatPermuteFloat32x16
+       OpConcatPermuteFloat64x2
+       OpConcatPermuteFloat64x4
+       OpConcatPermuteFloat64x8
+       OpConcatPermuteInt8x16
+       OpConcatPermuteInt8x32
+       OpConcatPermuteInt8x64
+       OpConcatPermuteInt16x8
+       OpConcatPermuteInt16x16
+       OpConcatPermuteInt16x32
+       OpConcatPermuteInt32x4
+       OpConcatPermuteInt32x8
+       OpConcatPermuteInt32x16
+       OpConcatPermuteInt64x2
+       OpConcatPermuteInt64x4
+       OpConcatPermuteInt64x8
+       OpConcatPermuteUint8x16
+       OpConcatPermuteUint8x32
+       OpConcatPermuteUint8x64
+       OpConcatPermuteUint16x8
+       OpConcatPermuteUint16x16
+       OpConcatPermuteUint16x32
+       OpConcatPermuteUint32x4
+       OpConcatPermuteUint32x8
+       OpConcatPermuteUint32x16
+       OpConcatPermuteUint64x2
+       OpConcatPermuteUint64x4
+       OpConcatPermuteUint64x8
         OpConvertToInt8Int16x8
         OpConvertToInt8Int16x16
         OpConvertToInt8Int16x32
@@ -6698,44 +6739,10 @@ const (
         OpOrUint64x2
         OpOrUint64x4
         OpOrUint64x8
-       OpPermute2Float32x4
-       OpPermute2Float32x8
-       OpPermute2Float32x16
-       OpPermute2Float64x2
-       OpPermute2Float64x4
-       OpPermute2Float64x8
-       OpPermute2Int8x16
-       OpPermute2Int8x32
-       OpPermute2Int8x64
-       OpPermute2Int16x8
-       OpPermute2Int16x16
-       OpPermute2Int16x32
-       OpPermute2Int32x4
-       OpPermute2Int32x8
-       OpPermute2Int32x16
-       OpPermute2Int64x2
-       OpPermute2Int64x4
-       OpPermute2Int64x8
-       OpPermute2Uint8x16
-       OpPermute2Uint8x32
-       OpPermute2Uint8x64
-       OpPermute2Uint16x8
-       OpPermute2Uint16x16
-       OpPermute2Uint16x32
-       OpPermute2Uint32x4
-       OpPermute2Uint32x8
-       OpPermute2Uint32x16
-       OpPermute2Uint64x2
-       OpPermute2Uint64x4
-       OpPermute2Uint64x8
         OpPermuteFloat32x8
         OpPermuteFloat32x16
         OpPermuteFloat64x4
         OpPermuteFloat64x8
-       OpPermuteGroupedInt8x32
-       OpPermuteGroupedInt8x64
-       OpPermuteGroupedUint8x32
-       OpPermuteGroupedUint8x64
         OpPermuteInt8x16
         OpPermuteInt8x32
         OpPermuteInt8x64
@@ -6746,6 +6753,12 @@ const (
         OpPermuteInt32x16
         OpPermuteInt64x4
         OpPermuteInt64x8
+       OpPermuteOrZeroGroupedInt8x32
+       OpPermuteOrZeroGroupedInt8x64
+       OpPermuteOrZeroGroupedUint8x32
+       OpPermuteOrZeroGroupedUint8x64
+       OpPermuteOrZeroInt8x16
+       OpPermuteOrZeroUint8x16
         OpPermuteUint8x16
         OpPermuteUint8x32
         OpPermuteUint8x64
@@ -7099,28 +7112,6 @@ const (
         OpGetElemUint16x8
         OpGetElemUint32x4
         OpGetElemUint64x2
-       OpPermuteConstantGroupedInt32x8
-       OpPermuteConstantGroupedInt32x16
-       OpPermuteConstantGroupedUint32x8
-       OpPermuteConstantGroupedUint32x16
-       OpPermuteConstantHiGroupedInt16x16
-       OpPermuteConstantHiGroupedInt16x32
-       OpPermuteConstantHiGroupedUint16x16
-       OpPermuteConstantHiGroupedUint16x32
-       OpPermuteConstantHiInt16x8
-       OpPermuteConstantHiInt32x4
-       OpPermuteConstantHiUint16x8
-       OpPermuteConstantHiUint32x4
-       OpPermuteConstantInt32x4
-       OpPermuteConstantLoGroupedInt16x16
-       OpPermuteConstantLoGroupedInt16x32
-       OpPermuteConstantLoGroupedUint16x16
-       OpPermuteConstantLoGroupedUint16x32
-       OpPermuteConstantLoInt16x8
-       OpPermuteConstantLoInt32x4
-       OpPermuteConstantLoUint16x8
-       OpPermuteConstantLoUint32x4
-       OpPermuteConstantUint32x4
         OpRotateAllLeftInt32x4
         OpRotateAllLeftInt32x8
         OpRotateAllLeftInt32x16
@@ -7240,6 +7231,24 @@ const (
         OpconcatSelectedConstantInt64x2
         OpconcatSelectedConstantUint32x4
         OpconcatSelectedConstantUint64x2
+       OppermuteScalarsGroupedInt32x8
+       OppermuteScalarsGroupedInt32x16
+       OppermuteScalarsGroupedUint32x8
+       OppermuteScalarsGroupedUint32x16
+       OppermuteScalarsHiGroupedInt16x16
+       OppermuteScalarsHiGroupedInt16x32
+       OppermuteScalarsHiGroupedUint16x16
+       OppermuteScalarsHiGroupedUint16x32
+       OppermuteScalarsHiInt16x8
+       OppermuteScalarsHiUint16x8
+       OppermuteScalarsInt32x4
+       OppermuteScalarsLoGroupedInt16x16
+       OppermuteScalarsLoGroupedInt16x32
+       OppermuteScalarsLoGroupedUint16x16
+       OppermuteScalarsLoGroupedUint16x32
+       OppermuteScalarsLoInt16x8
+       OppermuteScalarsLoUint16x8
+       OppermuteScalarsUint32x4
         OpternInt32x4
         OpternInt32x8
         OpternInt32x16
@@ -26142,6 +26151,20 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:   "VPERMB128",
+               argLen: 2,
+               asm:    x86.AVPERMB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
         {
                 name:   "VPERMB256",
                 argLen: 2,
@@ -26170,6 +26193,21 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:   "VPERMBMasked128",
+               argLen: 3,
+               asm:    x86.AVPERMB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 281472829161472},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281474976645120},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
         {
                 name:   "VPERMBMasked256",
                 argLen: 3,
@@ -39744,6 +39782,93 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:    "VPSHUFLW128",
+               auxType: auxUInt8,
+               argLen:  1,
+               asm:     x86.AVPSHUFLW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:    "VPSHUFLW256",
+               auxType: auxUInt8,
+               argLen:  1,
+               asm:     x86.AVPSHUFLW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:    "VPSHUFLW512",
+               auxType: auxUInt8,
+               argLen:  1,
+               asm:     x86.AVPSHUFLW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:    "VPSHUFLWMasked128",
+               auxType: auxUInt8,
+               argLen:  2,
+               asm:     x86.AVPSHUFLW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 281474976645120},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:    "VPSHUFLWMasked256",
+               auxType: auxUInt8,
+               argLen:  2,
+               asm:     x86.AVPSHUFLW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 281474976645120},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:    "VPSHUFLWMasked512",
+               auxType: auxUInt8,
+               argLen:  2,
+               asm:     x86.AVPSHUFLW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 281474976645120},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
         {
                 name:    "VPSLLD128const",
                 auxType: auxUInt8,
@@ -57607,6 +57732,57 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:         "VPSHUFLWMasked128Merging",
+               auxType:      auxUInt8,
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPSHUFLW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 281472829161472},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281474976645120},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:         "VPSHUFLWMasked256Merging",
+               auxType:      auxUInt8,
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPSHUFLW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 281472829161472},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281474976645120},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:         "VPSHUFLWMasked512Merging",
+               auxType:      auxUInt8,
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPSHUFLW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 281472829161472},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281474976645120},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
         {
                 name:         "VPSLLDMasked128constMerging",
                 auxType:      auxUInt8,
@@ -86874,6 +87050,156 @@ var opcodeTable = [...]opInfo{
                 argLen:  2,
                 generic: true,
         },
+       {
+               name:    "ConcatPermuteFloat32x4",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteFloat32x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteFloat32x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteFloat64x2",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteFloat64x4",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteFloat64x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteInt8x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteInt8x32",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteInt8x64",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteInt16x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteInt16x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteInt16x32",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteInt32x4",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteInt32x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteInt32x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteInt64x2",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteInt64x4",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteInt64x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteUint8x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteUint8x32",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteUint8x64",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteUint16x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteUint16x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteUint16x32",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteUint32x4",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteUint32x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteUint32x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteUint64x2",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteUint64x4",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "ConcatPermuteUint64x8",
+               argLen:  3,
+               generic: true,
+       },
         {
                 name:    "ConvertToInt8Int16x8",
                 argLen:  1,
@@ -89758,242 +90084,102 @@ var opcodeTable = [...]opInfo{
                 generic:     true,
         },
         {
-               name:    "Permute2Float32x4",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Float32x8",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Float32x16",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Float64x2",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Float64x4",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Float64x8",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Int8x16",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Int8x32",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Int8x64",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Int16x8",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Int16x16",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Int16x32",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Int32x4",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Int32x8",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Int32x16",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Int64x2",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Int64x4",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Int64x8",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Uint8x16",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Uint8x32",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Uint8x64",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Uint16x8",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Uint16x16",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Uint16x32",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Uint32x4",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Uint32x8",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Uint32x16",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Uint64x2",
-               argLen:  3,
-               generic: true,
-       },
-       {
-               name:    "Permute2Uint64x4",
-               argLen:  3,
+               name:    "PermuteFloat32x8",
+               argLen:  2,
                 generic: true,
         },
         {
-               name:    "Permute2Uint64x8",
-               argLen:  3,
+               name:    "PermuteFloat32x16",
+               argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteFloat32x8",
+               name:    "PermuteFloat64x4",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteFloat32x16",
+               name:    "PermuteFloat64x8",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteFloat64x4",
+               name:    "PermuteInt8x16",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteFloat64x8",
+               name:    "PermuteInt8x32",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteGroupedInt8x32",
+               name:    "PermuteInt8x64",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteGroupedInt8x64",
+               name:    "PermuteInt16x8",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteGroupedUint8x32",
+               name:    "PermuteInt16x16",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteGroupedUint8x64",
+               name:    "PermuteInt16x32",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteInt8x16",
+               name:    "PermuteInt32x8",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteInt8x32",
+               name:    "PermuteInt32x16",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteInt8x64",
+               name:    "PermuteInt64x4",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteInt16x8",
+               name:    "PermuteInt64x8",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteInt16x16",
+               name:    "PermuteOrZeroGroupedInt8x32",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteInt16x32",
+               name:    "PermuteOrZeroGroupedInt8x64",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteInt32x8",
+               name:    "PermuteOrZeroGroupedUint8x32",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteInt32x16",
+               name:    "PermuteOrZeroGroupedUint8x64",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteInt64x4",
+               name:    "PermuteOrZeroInt8x16",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "PermuteInt64x8",
+               name:    "PermuteOrZeroUint8x16",
                 argLen:  2,
                 generic: true,
         },
@@ -91830,138 +92016,6 @@ var opcodeTable = [...]opInfo{
                 argLen:  1,
                 generic: true,
         },
-       {
-               name:    "PermuteConstantGroupedInt32x8",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantGroupedInt32x16",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantGroupedUint32x8",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantGroupedUint32x16",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantHiGroupedInt16x16",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantHiGroupedInt16x32",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantHiGroupedUint16x16",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantHiGroupedUint16x32",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantHiInt16x8",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantHiInt32x4",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantHiUint16x8",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantHiUint32x4",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantInt32x4",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantLoGroupedInt16x16",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantLoGroupedInt16x32",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantLoGroupedUint16x16",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantLoGroupedUint16x32",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantLoInt16x8",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantLoInt32x4",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantLoUint16x8",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantLoUint32x4",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
-       {
-               name:    "PermuteConstantUint32x4",
-               auxType: auxUInt8,
-               argLen:  1,
-               generic: true,
-       },
         {
                 name:    "RotateAllLeftInt32x4",
                 auxType: auxUInt8,
@@ -92676,6 +92730,114 @@ var opcodeTable = [...]opInfo{
                 argLen:  2,
                 generic: true,
         },
+       {
+               name:    "permuteScalarsGroupedInt32x8",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsGroupedInt32x16",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsGroupedUint32x8",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsGroupedUint32x16",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsHiGroupedInt16x16",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsHiGroupedInt16x32",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsHiGroupedUint16x16",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsHiGroupedUint16x32",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsHiInt16x8",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsHiUint16x8",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsInt32x4",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsLoGroupedInt16x16",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsLoGroupedInt16x32",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsLoGroupedUint16x16",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsLoGroupedUint16x32",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsLoInt16x8",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsLoUint16x8",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "permuteScalarsUint32x4",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
         {
                 name:    "ternInt32x4",
                 auxType: auxUInt8,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go

index 76e524d524112bba8e45d565b56fe474906d2891..5ad2ed3f96bac4189cdbba79f937d2eb10007adf 100644 (file)
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -2546,6 +2546,96 @@ func rewriteValueAMD64(v *Value) bool {
                 return rewriteValueAMD64_OpCompressUint8x32(v)
         case OpCompressUint8x64:
                 return rewriteValueAMD64_OpCompressUint8x64(v)
+       case OpConcatPermuteFloat32x16:
+               v.Op = OpAMD64VPERMI2PS512
+               return true
+       case OpConcatPermuteFloat32x4:
+               v.Op = OpAMD64VPERMI2PS128
+               return true
+       case OpConcatPermuteFloat32x8:
+               v.Op = OpAMD64VPERMI2PS256
+               return true
+       case OpConcatPermuteFloat64x2:
+               v.Op = OpAMD64VPERMI2PD128
+               return true
+       case OpConcatPermuteFloat64x4:
+               v.Op = OpAMD64VPERMI2PD256
+               return true
+       case OpConcatPermuteFloat64x8:
+               v.Op = OpAMD64VPERMI2PD512
+               return true
+       case OpConcatPermuteInt16x16:
+               v.Op = OpAMD64VPERMI2W256
+               return true
+       case OpConcatPermuteInt16x32:
+               v.Op = OpAMD64VPERMI2W512
+               return true
+       case OpConcatPermuteInt16x8:
+               v.Op = OpAMD64VPERMI2W128
+               return true
+       case OpConcatPermuteInt32x16:
+               v.Op = OpAMD64VPERMI2D512
+               return true
+       case OpConcatPermuteInt32x4:
+               v.Op = OpAMD64VPERMI2D128
+               return true
+       case OpConcatPermuteInt32x8:
+               v.Op = OpAMD64VPERMI2D256
+               return true
+       case OpConcatPermuteInt64x2:
+               v.Op = OpAMD64VPERMI2Q128
+               return true
+       case OpConcatPermuteInt64x4:
+               v.Op = OpAMD64VPERMI2Q256
+               return true
+       case OpConcatPermuteInt64x8:
+               v.Op = OpAMD64VPERMI2Q512
+               return true
+       case OpConcatPermuteInt8x16:
+               v.Op = OpAMD64VPERMI2B128
+               return true
+       case OpConcatPermuteInt8x32:
+               v.Op = OpAMD64VPERMI2B256
+               return true
+       case OpConcatPermuteInt8x64:
+               v.Op = OpAMD64VPERMI2B512
+               return true
+       case OpConcatPermuteUint16x16:
+               v.Op = OpAMD64VPERMI2W256
+               return true
+       case OpConcatPermuteUint16x32:
+               v.Op = OpAMD64VPERMI2W512
+               return true
+       case OpConcatPermuteUint16x8:
+               v.Op = OpAMD64VPERMI2W128
+               return true
+       case OpConcatPermuteUint32x16:
+               v.Op = OpAMD64VPERMI2D512
+               return true
+       case OpConcatPermuteUint32x4:
+               v.Op = OpAMD64VPERMI2D128
+               return true
+       case OpConcatPermuteUint32x8:
+               v.Op = OpAMD64VPERMI2D256
+               return true
+       case OpConcatPermuteUint64x2:
+               v.Op = OpAMD64VPERMI2Q128
+               return true
+       case OpConcatPermuteUint64x4:
+               v.Op = OpAMD64VPERMI2Q256
+               return true
+       case OpConcatPermuteUint64x8:
+               v.Op = OpAMD64VPERMI2Q512
+               return true
+       case OpConcatPermuteUint8x16:
+               v.Op = OpAMD64VPERMI2B128
+               return true
+       case OpConcatPermuteUint8x32:
+               v.Op = OpAMD64VPERMI2B256
+               return true
+       case OpConcatPermuteUint8x64:
+               v.Op = OpAMD64VPERMI2B512
+               return true
         case OpConcatShiftBytesRightGroupedUint8x32:
                 v.Op = OpAMD64VPALIGNR256
                 return true
@@ -4476,162 +4566,6 @@ func rewriteValueAMD64(v *Value) bool {
         case OpPanicBounds:
                 v.Op = OpAMD64LoweredPanicBoundsRR
                 return true
-       case OpPermute2Float32x16:
-               v.Op = OpAMD64VPERMI2PS512
-               return true
-       case OpPermute2Float32x4:
-               v.Op = OpAMD64VPERMI2PS128
-               return true
-       case OpPermute2Float32x8:
-               v.Op = OpAMD64VPERMI2PS256
-               return true
-       case OpPermute2Float64x2:
-               v.Op = OpAMD64VPERMI2PD128
-               return true
-       case OpPermute2Float64x4:
-               v.Op = OpAMD64VPERMI2PD256
-               return true
-       case OpPermute2Float64x8:
-               v.Op = OpAMD64VPERMI2PD512
-               return true
-       case OpPermute2Int16x16:
-               v.Op = OpAMD64VPERMI2W256
-               return true
-       case OpPermute2Int16x32:
-               v.Op = OpAMD64VPERMI2W512
-               return true
-       case OpPermute2Int16x8:
-               v.Op = OpAMD64VPERMI2W128
-               return true
-       case OpPermute2Int32x16:
-               v.Op = OpAMD64VPERMI2D512
-               return true
-       case OpPermute2Int32x4:
-               v.Op = OpAMD64VPERMI2D128
-               return true
-       case OpPermute2Int32x8:
-               v.Op = OpAMD64VPERMI2D256
-               return true
-       case OpPermute2Int64x2:
-               v.Op = OpAMD64VPERMI2Q128
-               return true
-       case OpPermute2Int64x4:
-               v.Op = OpAMD64VPERMI2Q256
-               return true
-       case OpPermute2Int64x8:
-               v.Op = OpAMD64VPERMI2Q512
-               return true
-       case OpPermute2Int8x16:
-               v.Op = OpAMD64VPERMI2B128
-               return true
-       case OpPermute2Int8x32:
-               v.Op = OpAMD64VPERMI2B256
-               return true
-       case OpPermute2Int8x64:
-               v.Op = OpAMD64VPERMI2B512
-               return true
-       case OpPermute2Uint16x16:
-               v.Op = OpAMD64VPERMI2W256
-               return true
-       case OpPermute2Uint16x32:
-               v.Op = OpAMD64VPERMI2W512
-               return true
-       case OpPermute2Uint16x8:
-               v.Op = OpAMD64VPERMI2W128
-               return true
-       case OpPermute2Uint32x16:
-               v.Op = OpAMD64VPERMI2D512
-               return true
-       case OpPermute2Uint32x4:
-               v.Op = OpAMD64VPERMI2D128
-               return true
-       case OpPermute2Uint32x8:
-               v.Op = OpAMD64VPERMI2D256
-               return true
-       case OpPermute2Uint64x2:
-               v.Op = OpAMD64VPERMI2Q128
-               return true
-       case OpPermute2Uint64x4:
-               v.Op = OpAMD64VPERMI2Q256
-               return true
-       case OpPermute2Uint64x8:
-               v.Op = OpAMD64VPERMI2Q512
-               return true
-       case OpPermute2Uint8x16:
-               v.Op = OpAMD64VPERMI2B128
-               return true
-       case OpPermute2Uint8x32:
-               v.Op = OpAMD64VPERMI2B256
-               return true
-       case OpPermute2Uint8x64:
-               v.Op = OpAMD64VPERMI2B512
-               return true
-       case OpPermuteConstantGroupedInt32x16:
-               v.Op = OpAMD64VPSHUFD512
-               return true
-       case OpPermuteConstantGroupedInt32x8:
-               v.Op = OpAMD64VPSHUFD256
-               return true
-       case OpPermuteConstantGroupedUint32x16:
-               v.Op = OpAMD64VPSHUFD512
-               return true
-       case OpPermuteConstantGroupedUint32x8:
-               v.Op = OpAMD64VPSHUFD256
-               return true
-       case OpPermuteConstantHiGroupedInt16x16:
-               v.Op = OpAMD64VPSHUFHW256
-               return true
-       case OpPermuteConstantHiGroupedInt16x32:
-               v.Op = OpAMD64VPSHUFHW512
-               return true
-       case OpPermuteConstantHiGroupedUint16x16:
-               v.Op = OpAMD64VPSHUFHW256
-               return true
-       case OpPermuteConstantHiGroupedUint16x32:
-               v.Op = OpAMD64VPSHUFHW512
-               return true
-       case OpPermuteConstantHiInt16x8:
-               v.Op = OpAMD64VPSHUFHW128
-               return true
-       case OpPermuteConstantHiInt32x4:
-               v.Op = OpAMD64VPSHUFHW128
-               return true
-       case OpPermuteConstantHiUint16x8:
-               v.Op = OpAMD64VPSHUFHW128
-               return true
-       case OpPermuteConstantHiUint32x4:
-               v.Op = OpAMD64VPSHUFHW128
-               return true
-       case OpPermuteConstantInt32x4:
-               v.Op = OpAMD64VPSHUFD128
-               return true
-       case OpPermuteConstantLoGroupedInt16x16:
-               v.Op = OpAMD64VPSHUFHW256
-               return true
-       case OpPermuteConstantLoGroupedInt16x32:
-               v.Op = OpAMD64VPSHUFHW512
-               return true
-       case OpPermuteConstantLoGroupedUint16x16:
-               v.Op = OpAMD64VPSHUFHW256
-               return true
-       case OpPermuteConstantLoGroupedUint16x32:
-               v.Op = OpAMD64VPSHUFHW512
-               return true
-       case OpPermuteConstantLoInt16x8:
-               v.Op = OpAMD64VPSHUFHW128
-               return true
-       case OpPermuteConstantLoInt32x4:
-               v.Op = OpAMD64VPSHUFHW128
-               return true
-       case OpPermuteConstantLoUint16x8:
-               v.Op = OpAMD64VPSHUFHW128
-               return true
-       case OpPermuteConstantLoUint32x4:
-               v.Op = OpAMD64VPSHUFHW128
-               return true
-       case OpPermuteConstantUint32x4:
-               v.Op = OpAMD64VPSHUFD128
-               return true
         case OpPermuteFloat32x16:
                 v.Op = OpAMD64VPERMPS512
                 return true
@@ -4644,18 +4578,6 @@ func rewriteValueAMD64(v *Value) bool {
         case OpPermuteFloat64x8:
                 v.Op = OpAMD64VPERMPD512
                 return true
-       case OpPermuteGroupedInt8x32:
-               v.Op = OpAMD64VPSHUFB256
-               return true
-       case OpPermuteGroupedInt8x64:
-               v.Op = OpAMD64VPSHUFB512
-               return true
-       case OpPermuteGroupedUint8x32:
-               v.Op = OpAMD64VPSHUFB256
-               return true
-       case OpPermuteGroupedUint8x64:
-               v.Op = OpAMD64VPSHUFB512
-               return true
         case OpPermuteInt16x16:
                 v.Op = OpAMD64VPERMW256
                 return true
@@ -4678,7 +4600,7 @@ func rewriteValueAMD64(v *Value) bool {
                 v.Op = OpAMD64VPERMQ512
                 return true
         case OpPermuteInt8x16:
-               v.Op = OpAMD64VPSHUFB128
+               v.Op = OpAMD64VPERMB128
                 return true
         case OpPermuteInt8x32:
                 v.Op = OpAMD64VPERMB256
@@ -4686,6 +4608,24 @@ func rewriteValueAMD64(v *Value) bool {
         case OpPermuteInt8x64:
                 v.Op = OpAMD64VPERMB512
                 return true
+       case OpPermuteOrZeroGroupedInt8x32:
+               v.Op = OpAMD64VPSHUFB256
+               return true
+       case OpPermuteOrZeroGroupedInt8x64:
+               v.Op = OpAMD64VPSHUFB512
+               return true
+       case OpPermuteOrZeroGroupedUint8x32:
+               v.Op = OpAMD64VPSHUFB256
+               return true
+       case OpPermuteOrZeroGroupedUint8x64:
+               v.Op = OpAMD64VPSHUFB512
+               return true
+       case OpPermuteOrZeroInt8x16:
+               v.Op = OpAMD64VPSHUFB128
+               return true
+       case OpPermuteOrZeroUint8x16:
+               v.Op = OpAMD64VPSHUFB128
+               return true
         case OpPermuteUint16x16:
                 v.Op = OpAMD64VPERMW256
                 return true
@@ -4708,7 +4648,7 @@ func rewriteValueAMD64(v *Value) bool {
                 v.Op = OpAMD64VPERMQ512
                 return true
         case OpPermuteUint8x16:
-               v.Op = OpAMD64VPSHUFB128
+               v.Op = OpAMD64VPERMB128
                 return true
         case OpPermuteUint8x32:
                 v.Op = OpAMD64VPERMB256
@@ -6124,6 +6064,60 @@ func rewriteValueAMD64(v *Value) bool {
         case OpconcatSelectedConstantUint64x2:
                 v.Op = OpAMD64VSHUFPD128
                 return true
+       case OppermuteScalarsGroupedInt32x16:
+               v.Op = OpAMD64VPSHUFD512
+               return true
+       case OppermuteScalarsGroupedInt32x8:
+               v.Op = OpAMD64VPSHUFD256
+               return true
+       case OppermuteScalarsGroupedUint32x16:
+               v.Op = OpAMD64VPSHUFD512
+               return true
+       case OppermuteScalarsGroupedUint32x8:
+               v.Op = OpAMD64VPSHUFD256
+               return true
+       case OppermuteScalarsHiGroupedInt16x16:
+               v.Op = OpAMD64VPSHUFHW256
+               return true
+       case OppermuteScalarsHiGroupedInt16x32:
+               v.Op = OpAMD64VPSHUFHW512
+               return true
+       case OppermuteScalarsHiGroupedUint16x16:
+               v.Op = OpAMD64VPSHUFHW256
+               return true
+       case OppermuteScalarsHiGroupedUint16x32:
+               v.Op = OpAMD64VPSHUFHW512
+               return true
+       case OppermuteScalarsHiInt16x8:
+               v.Op = OpAMD64VPSHUFHW128
+               return true
+       case OppermuteScalarsHiUint16x8:
+               v.Op = OpAMD64VPSHUFHW128
+               return true
+       case OppermuteScalarsInt32x4:
+               v.Op = OpAMD64VPSHUFD128
+               return true
+       case OppermuteScalarsLoGroupedInt16x16:
+               v.Op = OpAMD64VPSHUFLW256
+               return true
+       case OppermuteScalarsLoGroupedInt16x32:
+               v.Op = OpAMD64VPSHUFLW512
+               return true
+       case OppermuteScalarsLoGroupedUint16x16:
+               v.Op = OpAMD64VPSHUFLW256
+               return true
+       case OppermuteScalarsLoGroupedUint16x32:
+               v.Op = OpAMD64VPSHUFLW512
+               return true
+       case OppermuteScalarsLoInt16x8:
+               v.Op = OpAMD64VPSHUFLW128
+               return true
+       case OppermuteScalarsLoUint16x8:
+               v.Op = OpAMD64VPSHUFLW128
+               return true
+       case OppermuteScalarsUint32x4:
+               v.Op = OpAMD64VPSHUFD128
+               return true
         case OpternInt32x16:
                 v.Op = OpAMD64VPTERNLOGD512
                 return true
@@ -31247,6 +31241,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked128(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
+       // match: (VMOVDQU16Masked128 (VPERMI2W128 x y z) mask)
+       // result: (VPERMI2WMasked128 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2W128 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2WMasked128)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
         // match: (VMOVDQU16Masked128 (VPMOVWB128_128 x) mask)
         // result: (VPMOVWBMasked128_128 x mask)
         for {
@@ -31460,34 +31468,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked128(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
-       // match: (VMOVDQU16Masked128 (VPERMI2W128 x y z) mask)
-       // result: (VPERMI2WMasked128 x y z mask)
-       for {
-               if v_0.Op != OpAMD64VPERMI2W128 {
-                       break
-               }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
-               y := v_0.Args[1]
-               mask := v_1
-               v.reset(OpAMD64VPERMI2WMasked128)
-               v.AddArg4(x, y, z, mask)
-               return true
-       }
-       // match: (VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask)
-       // result: (VPSHUFHWMasked128 [a] x mask)
-       for {
-               if v_0.Op != OpAMD64VPSHUFHW128 {
-                       break
-               }
-               a := auxIntToUint8(v_0.AuxInt)
-               x := v_0.Args[0]
-               mask := v_1
-               v.reset(OpAMD64VPSHUFHWMasked128)
-               v.AuxInt = uint8ToAuxInt(a)
-               v.AddArg2(x, mask)
-               return true
-       }
         // match: (VMOVDQU16Masked128 (VPERMW128 x y) mask)
         // result: (VPERMWMasked128 x y mask)
         for {
@@ -31676,6 +31656,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked128(v *Value) bool {
                 v.AddArg3(x, y, mask)
                 return true
         }
+       // match: (VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask)
+       // result: (VPSHUFHWMasked128 [a] x mask)
+       for {
+               if v_0.Op != OpAMD64VPSHUFHW128 {
+                       break
+               }
+               a := auxIntToUint8(v_0.AuxInt)
+               x := v_0.Args[0]
+               mask := v_1
+               v.reset(OpAMD64VPSHUFHWMasked128)
+               v.AuxInt = uint8ToAuxInt(a)
+               v.AddArg2(x, mask)
+               return true
+       }
+       // match: (VMOVDQU16Masked128 (VPSHUFLW128 [a] x) mask)
+       // result: (VPSHUFLWMasked128 [a] x mask)
+       for {
+               if v_0.Op != OpAMD64VPSHUFLW128 {
+                       break
+               }
+               a := auxIntToUint8(v_0.AuxInt)
+               x := v_0.Args[0]
+               mask := v_1
+               v.reset(OpAMD64VPSHUFLWMasked128)
+               v.AuxInt = uint8ToAuxInt(a)
+               v.AddArg2(x, mask)
+               return true
+       }
         // match: (VMOVDQU16Masked128 (VPSLLW128const [a] x) mask)
         // result: (VPSLLWMasked128const [a] x mask)
         for {
@@ -31785,6 +31793,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked256(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
+       // match: (VMOVDQU16Masked256 (VPERMI2W256 x y z) mask)
+       // result: (VPERMI2WMasked256 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2W256 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2WMasked256)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
         // match: (VMOVDQU16Masked256 (VPMOVWB128_256 x) mask)
         // result: (VPMOVWBMasked128_256 x mask)
         for {
@@ -32034,34 +32056,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked256(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
-       // match: (VMOVDQU16Masked256 (VPERMI2W256 x y z) mask)
-       // result: (VPERMI2WMasked256 x y z mask)
-       for {
-               if v_0.Op != OpAMD64VPERMI2W256 {
-                       break
-               }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
-               y := v_0.Args[1]
-               mask := v_1
-               v.reset(OpAMD64VPERMI2WMasked256)
-               v.AddArg4(x, y, z, mask)
-               return true
-       }
-       // match: (VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask)
-       // result: (VPSHUFHWMasked256 [a] x mask)
-       for {
-               if v_0.Op != OpAMD64VPSHUFHW256 {
-                       break
-               }
-               a := auxIntToUint8(v_0.AuxInt)
-               x := v_0.Args[0]
-               mask := v_1
-               v.reset(OpAMD64VPSHUFHWMasked256)
-               v.AuxInt = uint8ToAuxInt(a)
-               v.AddArg2(x, mask)
-               return true
-       }
         // match: (VMOVDQU16Masked256 (VPERMW256 x y) mask)
         // result: (VPERMWMasked256 x y mask)
         for {
@@ -32250,6 +32244,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked256(v *Value) bool {
                 v.AddArg3(x, y, mask)
                 return true
         }
+       // match: (VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask)
+       // result: (VPSHUFHWMasked256 [a] x mask)
+       for {
+               if v_0.Op != OpAMD64VPSHUFHW256 {
+                       break
+               }
+               a := auxIntToUint8(v_0.AuxInt)
+               x := v_0.Args[0]
+               mask := v_1
+               v.reset(OpAMD64VPSHUFHWMasked256)
+               v.AuxInt = uint8ToAuxInt(a)
+               v.AddArg2(x, mask)
+               return true
+       }
+       // match: (VMOVDQU16Masked256 (VPSHUFLW256 [a] x) mask)
+       // result: (VPSHUFLWMasked256 [a] x mask)
+       for {
+               if v_0.Op != OpAMD64VPSHUFLW256 {
+                       break
+               }
+               a := auxIntToUint8(v_0.AuxInt)
+               x := v_0.Args[0]
+               mask := v_1
+               v.reset(OpAMD64VPSHUFLWMasked256)
+               v.AuxInt = uint8ToAuxInt(a)
+               v.AddArg2(x, mask)
+               return true
+       }
         // match: (VMOVDQU16Masked256 (VPSLLW256const [a] x) mask)
         // result: (VPSLLWMasked256const [a] x mask)
         for {
@@ -32359,6 +32381,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked512(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
+       // match: (VMOVDQU16Masked512 (VPERMI2W512 x y z) mask)
+       // result: (VPERMI2WMasked512 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2W512 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2WMasked512)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
         // match: (VMOVDQU16Masked512 (VPMOVSXWD512 x) mask)
         // result: (VPMOVSXWDMasked512 x mask)
         for {
@@ -32536,34 +32572,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked512(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
-       // match: (VMOVDQU16Masked512 (VPERMI2W512 x y z) mask)
-       // result: (VPERMI2WMasked512 x y z mask)
-       for {
-               if v_0.Op != OpAMD64VPERMI2W512 {
-                       break
-               }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
-               y := v_0.Args[1]
-               mask := v_1
-               v.reset(OpAMD64VPERMI2WMasked512)
-               v.AddArg4(x, y, z, mask)
-               return true
-       }
-       // match: (VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask)
-       // result: (VPSHUFHWMasked512 [a] x mask)
-       for {
-               if v_0.Op != OpAMD64VPSHUFHW512 {
-                       break
-               }
-               a := auxIntToUint8(v_0.AuxInt)
-               x := v_0.Args[0]
-               mask := v_1
-               v.reset(OpAMD64VPSHUFHWMasked512)
-               v.AuxInt = uint8ToAuxInt(a)
-               v.AddArg2(x, mask)
-               return true
-       }
         // match: (VMOVDQU16Masked512 (VPERMW512 x y) mask)
         // result: (VPERMWMasked512 x y mask)
         for {
@@ -32752,6 +32760,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked512(v *Value) bool {
                 v.AddArg3(x, y, mask)
                 return true
         }
+       // match: (VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask)
+       // result: (VPSHUFHWMasked512 [a] x mask)
+       for {
+               if v_0.Op != OpAMD64VPSHUFHW512 {
+                       break
+               }
+               a := auxIntToUint8(v_0.AuxInt)
+               x := v_0.Args[0]
+               mask := v_1
+               v.reset(OpAMD64VPSHUFHWMasked512)
+               v.AuxInt = uint8ToAuxInt(a)
+               v.AddArg2(x, mask)
+               return true
+       }
+       // match: (VMOVDQU16Masked512 (VPSHUFLW512 [a] x) mask)
+       // result: (VPSHUFLWMasked512 [a] x mask)
+       for {
+               if v_0.Op != OpAMD64VPSHUFLW512 {
+                       break
+               }
+               a := auxIntToUint8(v_0.AuxInt)
+               x := v_0.Args[0]
+               mask := v_1
+               v.reset(OpAMD64VPSHUFLWMasked512)
+               v.AuxInt = uint8ToAuxInt(a)
+               v.AddArg2(x, mask)
+               return true
+       }
         // match: (VMOVDQU16Masked512 (VPSLLW512const [a] x) mask)
         // result: (VPSLLWMasked512const [a] x mask)
         for {
@@ -32875,6 +32911,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked128(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
+       // match: (VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask)
+       // result: (VPERMI2PSMasked128 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2PS128 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2PSMasked128)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
+       // match: (VMOVDQU32Masked128 (VPERMI2D128 x y z) mask)
+       // result: (VPERMI2DMasked128 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2D128 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2DMasked128)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
         // match: (VMOVDQU32Masked128 (VPMOVDB128_128 x) mask)
         // result: (VPMOVDBMasked128_128 x mask)
         for {
@@ -33232,48 +33296,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked128(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
-       // match: (VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask)
-       // result: (VPERMI2PSMasked128 x y z mask)
-       for {
-               if v_0.Op != OpAMD64VPERMI2PS128 {
-                       break
-               }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
-               y := v_0.Args[1]
-               mask := v_1
-               v.reset(OpAMD64VPERMI2PSMasked128)
-               v.AddArg4(x, y, z, mask)
-               return true
-       }
-       // match: (VMOVDQU32Masked128 (VPERMI2D128 x y z) mask)
-       // result: (VPERMI2DMasked128 x y z mask)
-       for {
-               if v_0.Op != OpAMD64VPERMI2D128 {
-                       break
-               }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
-               y := v_0.Args[1]
-               mask := v_1
-               v.reset(OpAMD64VPERMI2DMasked128)
-               v.AddArg4(x, y, z, mask)
-               return true
-       }
-       // match: (VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask)
-       // result: (VPSHUFDMasked128 [a] x mask)
-       for {
-               if v_0.Op != OpAMD64VPSHUFD128 {
-                       break
-               }
-               a := auxIntToUint8(v_0.AuxInt)
-               x := v_0.Args[0]
-               mask := v_1
-               v.reset(OpAMD64VPSHUFDMasked128)
-               v.AuxInt = uint8ToAuxInt(a)
-               v.AddArg2(x, mask)
-               return true
-       }
         // match: (VMOVDQU32Masked128 (VPROLD128 [a] x) mask)
         // result: (VPROLDMasked128 [a] x mask)
         for {
@@ -33515,6 +33537,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked128(v *Value) bool {
                 v.AddArg3(x, y, mask)
                 return true
         }
+       // match: (VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask)
+       // result: (VPSHUFDMasked128 [a] x mask)
+       for {
+               if v_0.Op != OpAMD64VPSHUFD128 {
+                       break
+               }
+               a := auxIntToUint8(v_0.AuxInt)
+               x := v_0.Args[0]
+               mask := v_1
+               v.reset(OpAMD64VPSHUFDMasked128)
+               v.AuxInt = uint8ToAuxInt(a)
+               v.AddArg2(x, mask)
+               return true
+       }
         // match: (VMOVDQU32Masked128 (VPSLLD128const [a] x) mask)
         // result: (VPSLLDMasked128const [a] x mask)
         for {
@@ -33638,6 +33674,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked256(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
+       // match: (VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask)
+       // result: (VPERMI2PSMasked256 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2PS256 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2PSMasked256)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
+       // match: (VMOVDQU32Masked256 (VPERMI2D256 x y z) mask)
+       // result: (VPERMI2DMasked256 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2D256 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2DMasked256)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
         // match: (VMOVDQU32Masked256 (VPMOVDB128_256 x) mask)
         // result: (VPMOVDBMasked128_256 x mask)
         for {
@@ -34031,48 +34095,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked256(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
-       // match: (VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask)
-       // result: (VPERMI2PSMasked256 x y z mask)
-       for {
-               if v_0.Op != OpAMD64VPERMI2PS256 {
-                       break
-               }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
-               y := v_0.Args[1]
-               mask := v_1
-               v.reset(OpAMD64VPERMI2PSMasked256)
-               v.AddArg4(x, y, z, mask)
-               return true
-       }
-       // match: (VMOVDQU32Masked256 (VPERMI2D256 x y z) mask)
-       // result: (VPERMI2DMasked256 x y z mask)
-       for {
-               if v_0.Op != OpAMD64VPERMI2D256 {
-                       break
-               }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
-               y := v_0.Args[1]
-               mask := v_1
-               v.reset(OpAMD64VPERMI2DMasked256)
-               v.AddArg4(x, y, z, mask)
-               return true
-       }
-       // match: (VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask)
-       // result: (VPSHUFDMasked256 [a] x mask)
-       for {
-               if v_0.Op != OpAMD64VPSHUFD256 {
-                       break
-               }
-               a := auxIntToUint8(v_0.AuxInt)
-               x := v_0.Args[0]
-               mask := v_1
-               v.reset(OpAMD64VPSHUFDMasked256)
-               v.AuxInt = uint8ToAuxInt(a)
-               v.AddArg2(x, mask)
-               return true
-       }
         // match: (VMOVDQU32Masked256 (VPERMPS256 x y) mask)
         // result: (VPERMPSMasked256 x y mask)
         for {
@@ -34340,6 +34362,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked256(v *Value) bool {
                 v.AddArg3(x, y, mask)
                 return true
         }
+       // match: (VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask)
+       // result: (VPSHUFDMasked256 [a] x mask)
+       for {
+               if v_0.Op != OpAMD64VPSHUFD256 {
+                       break
+               }
+               a := auxIntToUint8(v_0.AuxInt)
+               x := v_0.Args[0]
+               mask := v_1
+               v.reset(OpAMD64VPSHUFDMasked256)
+               v.AuxInt = uint8ToAuxInt(a)
+               v.AddArg2(x, mask)
+               return true
+       }
         // match: (VMOVDQU32Masked256 (VPSLLD256const [a] x) mask)
         // result: (VPSLLDMasked256const [a] x mask)
         for {
@@ -34489,6 +34525,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
+       // match: (VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask)
+       // result: (VPERMI2PSMasked512 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2PS512 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2PSMasked512)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
+       // match: (VMOVDQU32Masked512 (VPERMI2D512 x y z) mask)
+       // result: (VPERMI2DMasked512 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2D512 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2DMasked512)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
         // match: (VMOVDQU32Masked512 (VPMOVDB128_512 x) mask)
         // result: (VPMOVDBMasked128_512 x mask)
         for {
@@ -34823,48 +34887,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool {
                 v.AddArg3(x, y, mask)
                 return true
         }
-       // match: (VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask)
-       // result: (VPERMI2PSMasked512 x y z mask)
-       for {
-               if v_0.Op != OpAMD64VPERMI2PS512 {
-                       break
-               }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
-               y := v_0.Args[1]
-               mask := v_1
-               v.reset(OpAMD64VPERMI2PSMasked512)
-               v.AddArg4(x, y, z, mask)
-               return true
-       }
-       // match: (VMOVDQU32Masked512 (VPERMI2D512 x y z) mask)
-       // result: (VPERMI2DMasked512 x y z mask)
-       for {
-               if v_0.Op != OpAMD64VPERMI2D512 {
-                       break
-               }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
-               y := v_0.Args[1]
-               mask := v_1
-               v.reset(OpAMD64VPERMI2DMasked512)
-               v.AddArg4(x, y, z, mask)
-               return true
-       }
-       // match: (VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask)
-       // result: (VPSHUFDMasked512 [a] x mask)
-       for {
-               if v_0.Op != OpAMD64VPSHUFD512 {
-                       break
-               }
-               a := auxIntToUint8(v_0.AuxInt)
-               x := v_0.Args[0]
-               mask := v_1
-               v.reset(OpAMD64VPSHUFDMasked512)
-               v.AuxInt = uint8ToAuxInt(a)
-               v.AddArg2(x, mask)
-               return true
-       }
         // match: (VMOVDQU32Masked512 (VPERMPS512 x y) mask)
         // result: (VPERMPSMasked512 x y mask)
         for {
@@ -35169,6 +35191,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool {
                 v.AddArg3(x, y, mask)
                 return true
         }
+       // match: (VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask)
+       // result: (VPSHUFDMasked512 [a] x mask)
+       for {
+               if v_0.Op != OpAMD64VPSHUFD512 {
+                       break
+               }
+               a := auxIntToUint8(v_0.AuxInt)
+               x := v_0.Args[0]
+               mask := v_1
+               v.reset(OpAMD64VPSHUFDMasked512)
+               v.AuxInt = uint8ToAuxInt(a)
+               v.AddArg2(x, mask)
+               return true
+       }
         // match: (VMOVDQU32Masked512 (VPSLLD512const [a] x) mask)
         // result: (VPSLLDMasked512const [a] x mask)
         for {
@@ -35280,6 +35316,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked128(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
+       // match: (VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask)
+       // result: (VPERMI2PDMasked128 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2PD128 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2PDMasked128)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
+       // match: (VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask)
+       // result: (VPERMI2QMasked128 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2Q128 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2QMasked128)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
         // match: (VMOVDQU64Masked128 (VPMOVQB128_128 x) mask)
         // result: (VPMOVQBMasked128_128 x mask)
         for {
@@ -35571,34 +35635,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked128(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
-       // match: (VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask)
-       // result: (VPERMI2PDMasked128 x y z mask)
-       for {
-               if v_0.Op != OpAMD64VPERMI2PD128 {
-                       break
-               }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
-               y := v_0.Args[1]
-               mask := v_1
-               v.reset(OpAMD64VPERMI2PDMasked128)
-               v.AddArg4(x, y, z, mask)
-               return true
-       }
-       // match: (VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask)
-       // result: (VPERMI2QMasked128 x y z mask)
-       for {
-               if v_0.Op != OpAMD64VPERMI2Q128 {
-                       break
-               }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
-               y := v_0.Args[1]
-               mask := v_1
-               v.reset(OpAMD64VPERMI2QMasked128)
-               v.AddArg4(x, y, z, mask)
-               return true
-       }
         // match: (VMOVDQU64Masked128 (VRCP14PD128 x) mask)
         // result: (VRCP14PDMasked128 x mask)
         for {
@@ -35987,6 +36023,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked256(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
+       // match: (VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask)
+       // result: (VPERMI2PDMasked256 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2PD256 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2PDMasked256)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
+       // match: (VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask)
+       // result: (VPERMI2QMasked256 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2Q256 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2QMasked256)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
         // match: (VMOVDQU64Masked256 (VPMOVQB128_256 x) mask)
         // result: (VPMOVQBMasked128_256 x mask)
         for {
@@ -36314,34 +36378,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked256(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
-       // match: (VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask)
-       // result: (VPERMI2PDMasked256 x y z mask)
-       for {
-               if v_0.Op != OpAMD64VPERMI2PD256 {
-                       break
-               }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
-               y := v_0.Args[1]
-               mask := v_1
-               v.reset(OpAMD64VPERMI2PDMasked256)
-               v.AddArg4(x, y, z, mask)
-               return true
-       }
-       // match: (VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask)
-       // result: (VPERMI2QMasked256 x y z mask)
-       for {
-               if v_0.Op != OpAMD64VPERMI2Q256 {
-                       break
-               }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
-               y := v_0.Args[1]
-               mask := v_1
-               v.reset(OpAMD64VPERMI2QMasked256)
-               v.AddArg4(x, y, z, mask)
-               return true
-       }
         // match: (VMOVDQU64Masked256 (VPERMPD256 x y) mask)
         // result: (VPERMPDMasked256 x y mask)
         for {
@@ -36782,6 +36818,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked512(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
+       // match: (VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask)
+       // result: (VPERMI2PDMasked512 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2PD512 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2PDMasked512)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
+       // match: (VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask)
+       // result: (VPERMI2QMasked512 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2Q512 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2QMasked512)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
         // match: (VMOVDQU64Masked512 (VPMOVQB128_512 x) mask)
         // result: (VPMOVQBMasked128_512 x mask)
         for {
@@ -37050,34 +37114,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked512(v *Value) bool {
                 v.AddArg3(x, y, mask)
                 return true
         }
-       // match: (VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask)
-       // result: (VPERMI2PDMasked512 x y z mask)
-       for {
-               if v_0.Op != OpAMD64VPERMI2PD512 {
-                       break
-               }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
-               y := v_0.Args[1]
-               mask := v_1
-               v.reset(OpAMD64VPERMI2PDMasked512)
-               v.AddArg4(x, y, z, mask)
-               return true
-       }
-       // match: (VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask)
-       // result: (VPERMI2QMasked512 x y z mask)
-       for {
-               if v_0.Op != OpAMD64VPERMI2Q512 {
-                       break
-               }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
-               y := v_0.Args[1]
-               mask := v_1
-               v.reset(OpAMD64VPERMI2QMasked512)
-               v.AddArg4(x, y, z, mask)
-               return true
-       }
         // match: (VMOVDQU64Masked512 (VPERMPD512 x y) mask)
         // result: (VPERMPDMasked512 x y mask)
         for {
@@ -37491,6 +37527,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked128(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
+       // match: (VMOVDQU8Masked128 (VPERMI2B128 x y z) mask)
+       // result: (VPERMI2BMasked128 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2B128 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2BMasked128)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
         // match: (VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask)
         // result: (VPALIGNRMasked128 [a] x y mask)
         for {
@@ -37685,18 +37735,17 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked128(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
-       // match: (VMOVDQU8Masked128 (VPERMI2B128 x y z) mask)
-       // result: (VPERMI2BMasked128 x y z mask)
+       // match: (VMOVDQU8Masked128 (VPERMB128 x y) mask)
+       // result: (VPERMBMasked128 x y mask)
         for {
-               if v_0.Op != OpAMD64VPERMI2B128 {
+               if v_0.Op != OpAMD64VPERMB128 {
                         break
                 }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
                 y := v_0.Args[1]
+               x := v_0.Args[0]
                 mask := v_1
-               v.reset(OpAMD64VPERMI2BMasked128)
-               v.AddArg4(x, y, z, mask)
+               v.reset(OpAMD64VPERMBMasked128)
+               v.AddArg3(x, y, mask)
                 return true
         }
         // match: (VMOVDQU8Masked128 (VPSHUFB128 x y) mask)
@@ -37832,6 +37881,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked256(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
+       // match: (VMOVDQU8Masked256 (VPERMI2B256 x y z) mask)
+       // result: (VPERMI2BMasked256 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2B256 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2BMasked256)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
         // match: (VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask)
         // result: (VPALIGNRMasked256 [a] x y mask)
         for {
@@ -38026,18 +38089,17 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked256(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
-       // match: (VMOVDQU8Masked256 (VPERMI2B256 x y z) mask)
-       // result: (VPERMI2BMasked256 x y z mask)
+       // match: (VMOVDQU8Masked256 (VPERMB256 x y) mask)
+       // result: (VPERMBMasked256 x y mask)
         for {
-               if v_0.Op != OpAMD64VPERMI2B256 {
+               if v_0.Op != OpAMD64VPERMB256 {
                         break
                 }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
                 y := v_0.Args[1]
+               x := v_0.Args[0]
                 mask := v_1
-               v.reset(OpAMD64VPERMI2BMasked256)
-               v.AddArg4(x, y, z, mask)
+               v.reset(OpAMD64VPERMBMasked256)
+               v.AddArg3(x, y, mask)
                 return true
         }
         // match: (VMOVDQU8Masked256 (VPSHUFB256 x y) mask)
@@ -38053,19 +38115,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked256(v *Value) bool {
                 v.AddArg3(x, y, mask)
                 return true
         }
-       // match: (VMOVDQU8Masked256 (VPERMB256 x y) mask)
-       // result: (VPERMBMasked256 x y mask)
-       for {
-               if v_0.Op != OpAMD64VPERMB256 {
-                       break
-               }
-               y := v_0.Args[1]
-               x := v_0.Args[0]
-               mask := v_1
-               v.reset(OpAMD64VPERMBMasked256)
-               v.AddArg3(x, y, mask)
-               return true
-       }
         // match: (VMOVDQU8Masked256 (VPSUBB256 x y) mask)
         // result: (VPSUBBMasked256 x y mask)
         for {
@@ -38186,6 +38235,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
+       // match: (VMOVDQU8Masked512 (VPERMI2B512 x y z) mask)
+       // result: (VPERMI2BMasked512 x y z mask)
+       for {
+               if v_0.Op != OpAMD64VPERMI2B512 {
+                       break
+               }
+               z := v_0.Args[2]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               mask := v_1
+               v.reset(OpAMD64VPERMI2BMasked512)
+               v.AddArg4(x, y, z, mask)
+               return true
+       }
         // match: (VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask)
         // result: (VPALIGNRMasked512 [a] x y mask)
         for {
@@ -38380,18 +38443,17 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v *Value) bool {
                 v.AddArg2(x, mask)
                 return true
         }
-       // match: (VMOVDQU8Masked512 (VPERMI2B512 x y z) mask)
-       // result: (VPERMI2BMasked512 x y z mask)
+       // match: (VMOVDQU8Masked512 (VPERMB512 x y) mask)
+       // result: (VPERMBMasked512 x y mask)
         for {
-               if v_0.Op != OpAMD64VPERMI2B512 {
+               if v_0.Op != OpAMD64VPERMB512 {
                         break
                 }
-               z := v_0.Args[2]
-               x := v_0.Args[0]
                 y := v_0.Args[1]
+               x := v_0.Args[0]
                 mask := v_1
-               v.reset(OpAMD64VPERMI2BMasked512)
-               v.AddArg4(x, y, z, mask)
+               v.reset(OpAMD64VPERMBMasked512)
+               v.AddArg3(x, y, mask)
                 return true
         }
         // match: (VMOVDQU8Masked512 (VPSHUFB512 x y) mask)
@@ -38407,19 +38469,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v *Value) bool {
                 v.AddArg3(x, y, mask)
                 return true
         }
-       // match: (VMOVDQU8Masked512 (VPERMB512 x y) mask)
-       // result: (VPERMBMasked512 x y mask)
-       for {
-               if v_0.Op != OpAMD64VPERMB512 {
-                       break
-               }
-               y := v_0.Args[1]
-               x := v_0.Args[0]
-               mask := v_1
-               v.reset(OpAMD64VPERMBMasked512)
-               v.AddArg3(x, y, mask)
-               return true
-       }
         // match: (VMOVDQU8Masked512 (VPSUBB512 x y) mask)
         // result: (VPSUBBMasked512 x y mask)
         for {
@@ -42642,6 +42691,21 @@ func rewriteValueAMD64_OpAMD64VPBLENDMWMasked512(v *Value) bool {
                 v.AddArg3(dst, x, mask)
                 return true
         }
+       // match: (VPBLENDMWMasked512 dst (VPSHUFLW512 [a] x) mask)
+       // result: (VPSHUFLWMasked512Merging dst [a] x mask)
+       for {
+               dst := v_0
+               if v_1.Op != OpAMD64VPSHUFLW512 {
+                       break
+               }
+               a := auxIntToUint8(v_1.AuxInt)
+               x := v_1.Args[0]
+               mask := v_2
+               v.reset(OpAMD64VPSHUFLWMasked512Merging)
+               v.AuxInt = uint8ToAuxInt(a)
+               v.AddArg3(dst, x, mask)
+               return true
+       }
         // match: (VPBLENDMWMasked512 dst (VPSLLVW512 x y) mask)
         // result: (VPSLLVWMasked512Merging dst x y mask)
         for {
@@ -45526,6 +45590,27 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB128(v *Value) bool {
                 v.AddArg3(dst, x, v0)
                 return true
         }
+       // match: (VPBLENDVB128 dst (VPSHUFLW128 [a] x) mask)
+       // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
+       // result: (VPSHUFLWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for {
+               dst := v_0
+               if v_1.Op != OpAMD64VPSHUFLW128 {
+                       break
+               }
+               a := auxIntToUint8(v_1.AuxInt)
+               x := v_1.Args[0]
+               mask := v_2
+               if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHUFLWMasked128Merging)
+               v.AuxInt = uint8ToAuxInt(a)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(dst, x, v0)
+               return true
+       }
         // match: (VPBLENDVB128 dst (VPSLLD128const [a] x) mask)
         // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
         // result: (VPSLLDMasked128constMerging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
@@ -48223,6 +48308,27 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB256(v *Value) bool {
                 v.AddArg3(dst, x, v0)
                 return true
         }
+       // match: (VPBLENDVB256 dst (VPSHUFLW256 [a] x) mask)
+       // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
+       // result: (VPSHUFLWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask))
+       for {
+               dst := v_0
+               if v_1.Op != OpAMD64VPSHUFLW256 {
+                       break
+               }
+               a := auxIntToUint8(v_1.AuxInt)
+               x := v_1.Args[0]
+               mask := v_2
+               if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHUFLWMasked256Merging)
+               v.AuxInt = uint8ToAuxInt(a)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(dst, x, v0)
+               return true
+       }
         // match: (VPBLENDVB256 dst (VPSLLD256const [a] x) mask)
         // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
         // result: (VPSLLDMasked256constMerging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go

index 818b3544aed75ba84357c20d1150bd09b3a1d022..34e491371eae7724f5ca602dd17c234f70c16e71 100644 (file)
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -228,6 +228,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
         addF(simdPackage, "Uint64x2.Compress", opLen2(ssa.OpCompressUint64x2, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Uint64x4.Compress", opLen2(ssa.OpCompressUint64x4, types.TypeVec256), sys.AMD64)
         addF(simdPackage, "Uint64x8.Compress", opLen2(ssa.OpCompressUint64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x64.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint8x64.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int16x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint16x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint16x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint16x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x8, types.TypeVec512), sys.AMD64)
         addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64)
         addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64)
         addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64)
@@ -802,8 +832,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
         addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Uint64x4.Or", opLen2(ssa.OpOrUint64x4, types.TypeVec256), sys.AMD64)
         addF(simdPackage, "Uint64x8.Or", opLen2(ssa.OpOrUint64x8, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int8x16.Permute", opLen2(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint8x16.Permute", opLen2(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64)
         addF(simdPackage, "Uint8x32.Permute", opLen2_21(ssa.OpPermuteUint8x32, types.TypeVec256), sys.AMD64)
         addF(simdPackage, "Int8x64.Permute", opLen2_21(ssa.OpPermuteInt8x64, types.TypeVec512), sys.AMD64)
@@ -826,62 +856,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
         addF(simdPackage, "Float64x8.Permute", opLen2_21(ssa.OpPermuteFloat64x8, types.TypeVec512), sys.AMD64)
         addF(simdPackage, "Int64x8.Permute", opLen2_21(ssa.OpPermuteInt64x8, types.TypeVec512), sys.AMD64)
         addF(simdPackage, "Uint64x8.Permute", opLen2_21(ssa.OpPermuteUint64x8, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int8x16.Permute2", opLen3_231(ssa.OpPermute2Int8x16, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint8x16.Permute2", opLen3_231(ssa.OpPermute2Uint8x16, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int8x32.Permute2", opLen3_231(ssa.OpPermute2Int8x32, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint8x32.Permute2", opLen3_231(ssa.OpPermute2Uint8x32, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int8x64.Permute2", opLen3_231(ssa.OpPermute2Int8x64, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Uint8x64.Permute2", opLen3_231(ssa.OpPermute2Uint8x64, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int16x8.Permute2", opLen3_231(ssa.OpPermute2Int16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint16x8.Permute2", opLen3_231(ssa.OpPermute2Uint16x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int16x16.Permute2", opLen3_231(ssa.OpPermute2Int16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint16x16.Permute2", opLen3_231(ssa.OpPermute2Uint16x16, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int16x32.Permute2", opLen3_231(ssa.OpPermute2Int16x32, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Uint16x32.Permute2", opLen3_231(ssa.OpPermute2Uint16x32, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Float32x4.Permute2", opLen3_231(ssa.OpPermute2Float32x4, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int32x4.Permute2", opLen3_231(ssa.OpPermute2Int32x4, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint32x4.Permute2", opLen3_231(ssa.OpPermute2Uint32x4, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Float32x8.Permute2", opLen3_231(ssa.OpPermute2Float32x8, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int32x8.Permute2", opLen3_231(ssa.OpPermute2Int32x8, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint32x8.Permute2", opLen3_231(ssa.OpPermute2Uint32x8, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Float32x16.Permute2", opLen3_231(ssa.OpPermute2Float32x16, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int32x16.Permute2", opLen3_231(ssa.OpPermute2Int32x16, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Uint32x16.Permute2", opLen3_231(ssa.OpPermute2Uint32x16, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Float64x2.Permute2", opLen3_231(ssa.OpPermute2Float64x2, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int64x2.Permute2", opLen3_231(ssa.OpPermute2Int64x2, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint64x2.Permute2", opLen3_231(ssa.OpPermute2Uint64x2, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Float64x4.Permute2", opLen3_231(ssa.OpPermute2Float64x4, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int64x4.Permute2", opLen3_231(ssa.OpPermute2Int64x4, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint64x4.Permute2", opLen3_231(ssa.OpPermute2Uint64x4, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Float64x8.Permute2", opLen3_231(ssa.OpPermute2Float64x8, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int64x8.Permute2", opLen3_231(ssa.OpPermute2Int64x8, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Uint64x8.Permute2", opLen3_231(ssa.OpPermute2Uint64x8, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Int32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantInt32x4, types.TypeVec128, 0), sys.AMD64)
-       addF(simdPackage, "Uint32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantUint32x4, types.TypeVec128, 0), sys.AMD64)
-       addF(simdPackage, "Int32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x8, types.TypeVec256, 0), sys.AMD64)
-       addF(simdPackage, "Int32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x16, types.TypeVec512, 0), sys.AMD64)
-       addF(simdPackage, "Uint32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x8, types.TypeVec256, 0), sys.AMD64)
-       addF(simdPackage, "Uint32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
-       addF(simdPackage, "Int16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt16x8, types.TypeVec128, 0), sys.AMD64)
-       addF(simdPackage, "Int32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt32x4, types.TypeVec128, 0), sys.AMD64)
-       addF(simdPackage, "Uint16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint16x8, types.TypeVec128, 0), sys.AMD64)
-       addF(simdPackage, "Uint32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint32x4, types.TypeVec128, 0), sys.AMD64)
-       addF(simdPackage, "Int16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
-       addF(simdPackage, "Int16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
-       addF(simdPackage, "Uint16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
-       addF(simdPackage, "Uint16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
-       addF(simdPackage, "Int16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt16x8, types.TypeVec128, 0), sys.AMD64)
-       addF(simdPackage, "Int32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt32x4, types.TypeVec128, 0), sys.AMD64)
-       addF(simdPackage, "Uint16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint16x8, types.TypeVec128, 0), sys.AMD64)
-       addF(simdPackage, "Uint32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint32x4, types.TypeVec128, 0), sys.AMD64)
-       addF(simdPackage, "Int16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
-       addF(simdPackage, "Int16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
-       addF(simdPackage, "Uint16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
-       addF(simdPackage, "Uint16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
-       addF(simdPackage, "Int8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x32, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x64, types.TypeVec512), sys.AMD64)
-       addF(simdPackage, "Uint8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x32, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int8x16.PermuteOrZero", opLen2(ssa.OpPermuteOrZeroInt8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint8x16.PermuteOrZero", opLen2(ssa.OpPermuteOrZeroUint8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int8x32.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedInt8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x64.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedInt8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint8x32.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedUint8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint8x64.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedUint8x64, types.TypeVec512), sys.AMD64)
         addF(simdPackage, "Float32x4.Reciprocal", opLen1(ssa.OpReciprocalFloat32x4, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Float32x8.Reciprocal", opLen1(ssa.OpReciprocalFloat32x8, types.TypeVec256), sys.AMD64)
         addF(simdPackage, "Float32x16.Reciprocal", opLen1(ssa.OpReciprocalFloat32x16, types.TypeVec512), sys.AMD64)
@@ -1300,6 +1280,24 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
         addF(simdPackage, "Uint32x16.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
         addF(simdPackage, "Uint64x4.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, 0), sys.AMD64)
         addF(simdPackage, "Uint64x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, 0), sys.AMD64)
+       addF(simdPackage, "Int32x4.permuteScalars", opLen1Imm8(ssa.OppermuteScalarsInt32x4, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Uint32x4.permuteScalars", opLen1Imm8(ssa.OppermuteScalarsUint32x4, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Int32x8.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedInt32x8, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Int32x16.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedInt32x16, types.TypeVec512, 0), sys.AMD64)
+       addF(simdPackage, "Uint32x8.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedUint32x8, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Uint32x16.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
+       addF(simdPackage, "Int16x8.permuteScalarsHi", opLen1Imm8(ssa.OppermuteScalarsHiInt16x8, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Uint16x8.permuteScalarsHi", opLen1Imm8(ssa.OppermuteScalarsHiUint16x8, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Int16x16.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Int16x32.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
+       addF(simdPackage, "Uint16x16.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Uint16x32.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
+       addF(simdPackage, "Int16x8.permuteScalarsLo", opLen1Imm8(ssa.OppermuteScalarsLoInt16x8, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Uint16x8.permuteScalarsLo", opLen1Imm8(ssa.OppermuteScalarsLoUint16x8, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Int16x16.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Int16x32.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
+       addF(simdPackage, "Uint16x16.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Uint16x32.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
         addF(simdPackage, "Int32x4.tern", opLen3Imm8(ssa.OpternInt32x4, types.TypeVec128, 0), sys.AMD64)
         addF(simdPackage, "Int32x8.tern", opLen3Imm8(ssa.OpternInt32x8, types.TypeVec256, 0), sys.AMD64)
         addF(simdPackage, "Int32x16.tern", opLen3Imm8(ssa.OpternInt32x16, types.TypeVec512, 0), sys.AMD64)
diff --git a/src/simd/_gen/simdgen/gen_simdGenericOps.go b/src/simd/_gen/simdgen/gen_simdGenericOps.go

index 3dbbeb09f7298b839a6c86418328370f5a27dfac..bcbc18b3b2021eb655d0897f167cee3c2c846435 100644 (file)
--- a/src/simd/_gen/simdgen/gen_simdGenericOps.go
+++ b/src/simd/_gen/simdgen/gen_simdGenericOps.go
@@ -46,6 +46,9 @@ func writeSIMDGenericOps(ops []Operation) *bytes.Buffer {
                 if op.NoGenericOps != nil && *op.NoGenericOps == "true" {
                         continue
                 }
+               if op.SkipMaskedMethod() {
+                       continue
+               }
                 _, _, _, immType, gOp := op.shape()
                 gOpData := genericOpsData{gOp.GenericName(), len(gOp.In), op.Commutative}
                 if immType == VarImm || immType == ConstVarImm {
diff --git a/src/simd/_gen/simdgen/gen_simdIntrinsics.go b/src/simd/_gen/simdgen/gen_simdIntrinsics.go

index b963fb9abb30693dabe20c2bf75f7c259987529e..04344dc8315b65f73318aa7c2833096a7ebc8e81 100644 (file)
--- a/src/simd/_gen/simdgen/gen_simdIntrinsics.go
+++ b/src/simd/_gen/simdgen/gen_simdIntrinsics.go
@@ -107,6 +107,9 @@ func writeSIMDIntrinsics(ops []Operation, typeMap simdTypeMap) *bytes.Buffer {
                 if op.NoTypes != nil && *op.NoTypes == "true" {
                         continue
                 }
+               if op.SkipMaskedMethod() {
+                       continue
+               }
                 if s, op, err := classifyOp(op); err == nil {
                         if err := t.ExecuteTemplate(buffer, s, op); err != nil {
                                 panic(fmt.Errorf("failed to execute template %s for op %s: %w", s, op.Go, err))
diff --git a/src/simd/_gen/simdgen/gen_simdTypes.go b/src/simd/_gen/simdgen/gen_simdTypes.go

index 23b363d38a8385bb96273c16c615a0724f25aeab..dc5f77adaab0cbf5e5159970be029acf509f1acd 100644 (file)
--- a/src/simd/_gen/simdgen/gen_simdTypes.go
+++ b/src/simd/_gen/simdgen/gen_simdTypes.go
@@ -604,6 +604,9 @@ func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) (f, fI *bytes.Buffer)
                 if op.NoTypes != nil && *op.NoTypes == "true" {
                         continue
                 }
+               if op.SkipMaskedMethod() {
+                       continue
+               }
                 idxVecAsScalar, err := checkVecAsScalar(op)
                 if err != nil {
                         panic(err)
diff --git a/src/simd/_gen/simdgen/gen_simdrules.go b/src/simd/_gen/simdgen/gen_simdrules.go

index 19393add71abb858d13f447dd5cfd1f0f151efb1..5693496c923f82bceec6a9669f4de24d4d79cda8 100644 (file)
--- a/src/simd/_gen/simdgen/gen_simdrules.go
+++ b/src/simd/_gen/simdgen/gen_simdrules.go
@@ -345,7 +345,8 @@ func writeSIMDRules(ops []Operation) *bytes.Buffer {
                         data.ArgsOut = "..."
                 }
                 data.tplName = tplName
-               if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" {
+               if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" ||
+                       opr.SkipMaskedMethod() {
                         optData = append(optData, data)
                         continue
                 }
diff --git a/src/simd/_gen/simdgen/godefs.go b/src/simd/_gen/simdgen/godefs.go

index 7d3943b4b841b3cacbc5f7e39c6adf11afe9a540..0b8fbd7e3de85a2fa0beccb95a10cb63b747650b 100644 (file)
--- a/src/simd/_gen/simdgen/godefs.go
+++ b/src/simd/_gen/simdgen/godefs.go
@@ -73,6 +73,29 @@ type rawOperation struct {
         NoGenericOps *string
         // If non-nil, this string will be attached to the machine ssa op name.  E.g. "const"
         SSAVariant *string
+       // If true, do not emit method declarations, generic ops, or intrinsics for masked variants,
+       // but DO emit the architecture-specific opcodes and optimizations.
+       HideMaskMethods *bool
+}
+
+func (o *Operation) IsMasked() bool {
+       if len(o.InVariant) == 0 {
+               return false
+       }
+       if len(o.InVariant) == 1 && o.InVariant[0].Class == "mask" {
+               return true
+       }
+       panic(fmt.Errorf("unknown inVariant"))
+}
+
+func (o *Operation) SkipMaskedMethod() bool {
+       if o.HideMaskMethods == nil {
+               return false
+       }
+       if *o.HideMaskMethods && o.IsMasked() {
+               return true
+       }
+       return false
  }
  
  func (o *Operation) DecodeUnified(v *unify.Value) error {
@@ -80,14 +103,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error {
                 return err
         }
  
-       isMasked := false
-       if len(o.InVariant) == 0 {
-               // No variant
-       } else if len(o.InVariant) == 1 && o.InVariant[0].Class == "mask" {
-               isMasked = true
-       } else {
-               return fmt.Errorf("unknown inVariant")
-       }
+       isMasked := o.IsMasked()
  
         // Compute full Go method name.
         o.Go = o.rawOperation.Go
@@ -104,6 +120,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error {
         o.Documentation = regexp.MustCompile(`\bNAME\b`).ReplaceAllString(o.Documentation, o.Go)
         if isMasked {
                 o.Documentation += "\n//\n// This operation is applied selectively under a write mask."
+               // Suppress generic op and method declaration for exported methods, if a mask is present.
                 if unicode.IsUpper([]rune(o.Go)[0]) {
                         trueVal := "true"
                         o.NoGenericOps = &trueVal
diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml

index bb47819f2fb6e653804b949b494159a8798ee7f4..44bd8efb7fdbe00b671b39c70dd514593a0d0dc1 100644 (file)
--- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml
+++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml
@@ -27,18 +27,22 @@
    constImm: 1
    documentation: !string |-
      // NAME returns the upper half of x.
+- go: PermuteOrZero
+  commutative: false
+  documentation: !string |-
+    // NAME performs a full permutation of vector x using indices:
+    // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
  - go: Permute
    commutative: false
    documentation: !string |-
      // NAME performs a full permutation of vector x using indices:
      // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-    // Only the needed bits to represent x's index are used in indices' elements.
-- go: Permute2 # Permute2 is only available on or after AVX512
+- go: ConcatPermute # ConcatPermute is only available on or after AVX512
    commutative: false
    documentation: !string |-
      // NAME performs a full permutation of vector x, y using indices:
      // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-    // where xy is x appending y.
+    // where xy is the concatenation of x (lower half) and y (upper half).
      // Only the needed bits to represent xy's index are used in indices' elements.
  - go: Compress
    commutative: false
@@ -74,31 +78,35 @@
    documentation: !string |-
      // NAME copies element zero of its (128-bit) input to all elements of
      // the 512-bit output vector.
+- go: PermuteOrZeroGrouped
+  commutative: false
+  documentation: !string |- # Detailed documentation will rely on the specific ops.
+    // NAME performs a grouped permutation of vector x using indices:
  - go: PermuteGrouped
    commutative: false
    documentation: !string |- # Detailed documentation will rely on the specific ops.
      // NAME performs a grouped permutation of vector x using indices:
-- go: PermuteConstant
+- go: permuteScalars
    commutative: false
    documentation: !string |- # Detailed documentation will rely on the specific ops.
      // NAME performs a permutation of vector x using constant indices:
-- go: PermuteConstantGrouped
+- go: permuteScalarsGrouped
    commutative: false
    documentation: !string |- # Detailed documentation will rely on the specific ops.
      // NAME performs a grouped permutation of vector x using constant indices:
-- go: PermuteConstantLo
+- go: permuteScalarsLo
    commutative: false
    documentation: !string |- # Detailed documentation will rely on the specific ops.
      // NAME performs a permutation of vector x using constant indices:
-- go: PermuteConstantLoGrouped
+- go: permuteScalarsLoGrouped
    commutative: false
    documentation: !string |- # Detailed documentation will rely on the specific ops.
      // NAME performs a grouped permutation of vector x using constant indices:
-- go: PermuteConstantHi
+- go: permuteScalarsHi
    commutative: false
    documentation: !string |- # Detailed documentation will rely on the specific ops.
      // NAME performs a permutation of vector x using constant indices:
-- go: PermuteConstantHiGrouped
+- go: permuteScalarsHiGrouped
    commutative: false
    documentation: !string |- # Detailed documentation will rely on the specific ops.
      // NAME performs a grouped permutation of vector x using constant indices:
@@ -218,8 +226,10 @@
  - go: Select128FromPair
    commutative: false
    documentation: !string |-
-    // NAME selects the low and high 128-bit halves from the 128-bit halves
-    // of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+    // NAME treats the 256-bit vectors x and y as a single vector of four
+    // 128-bit elements, and returns a 256-bit result formed by 
+    // concatenating the two elements specified by lo and hi.
+    // For example, {4,5}.NAME(3,0,{6,7}) returns {7,4}.
  
  - go: ConcatShiftBytesRight
    commutative: false
diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml

index 75fbc532b8cc302f8907dff471efdd9b091d7b2f..697d6a8bced40b5637b0848d21dd7df6b596e452 100644 (file)
--- a/src/simd/_gen/simdgen/ops/Moves/go.yaml
+++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml
@@ -213,19 +213,75 @@
    - *f64xN
  
  - go: Permute
-  asm: "VPERM[BWDQ]|VPERMP[SD]"
+  asm: "VPERMQ|VPERMPD"
+  addDoc: !string |-
+    // The low 2 bits (values 0-3) of each element of indices are used.
    operandOrder: "21Type1"
    in:
    - &anyindices
      go: $t
      name: indices
      overwriteBase: uint
+  - &any4
+    go: $t
+    lanes: 4
+  out:
    - &any
      go: $t
+
+- go: Permute
+  asm: "VPERM[WDQ]|VPERMP[SD]"
+  addDoc: !string |-
+    // The low 3 bits (values 0-7) of each element of indices are used.
+  operandOrder: "21Type1"
+  in:
+  - *anyindices
+  - &any8
+    go: $t
+    lanes: 8
+  out:
+  - *any
+
+- go: Permute
+  asm: "VPERM[BWD]|VPERMPS"
+  addDoc: !string |-
+    // The low 4 bits (values 0-15) of each element of indices are used.
+  operandOrder: "21Type1"
+  in:
+  - *anyindices
+  - &any16
+    go: $t
+    lanes: 16
    out:
    - *any
  
-- go: Permute2
+- go: Permute
+  asm: "VPERM[BW]"
+  addDoc: !string |-
+    // The low 5 bits (values 0-31) of each element of indices are used.
+  operandOrder: "21Type1"
+  in:
+  - *anyindices
+  - &any32
+    go: $t
+    lanes: 32
+  out:
+  - *any
+
+- go: Permute
+  asm: "VPERMB"
+  addDoc: !string |-
+    // The low 6 bits (values 0-63) of each element of indices are used.
+  operandOrder: "21Type1"
+  in:
+  - *anyindices
+  - &any64
+    go: $t
+    lanes: 64
+  out:
+  - *any
+
+- go: ConcatPermute
    asm: "VPERMI2[BWDQ]|VPERMI2P[SD]"
    # Because we are overwriting the receiver's type, we
    # have to move the receiver to be a parameter so that
@@ -403,113 +459,137 @@
      base: $b
  
  # VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. (It's AVX)
-- go: Permute
+- go: PermuteOrZero
    asm: VPSHUFB
    addDoc: !string |-
-    // However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
+    // The lower four bits of each byte-sized index in indices select an element from x,
+    // unless the index's sign bit is set in which case zero is used instead.
    in:
    - &128any
      bits: 128
      go: $t
    - bits: 128
-    go: $t
      name: indices
+    base: int # always signed
    out:
    - *128any
-- go: PermuteGrouped
+
+- go: PermuteOrZeroGrouped
    asm: VPSHUFB
    addDoc: !string |-
-    // result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
-    // Only the needed bits to represent the index of a group of x are used in indices' elements.
-    // However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
+    // result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+    // The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
+    // unless the index's sign bit is set in which case zero is used instead.
      // Each group is of size 128-bit.
    in:
    - &256Or512any
      bits: "256|512"
      go: $t
    - bits: "256|512"
-    go: $t
+    base: int
      name: indices
    out:
    - *256Or512any
  
-- go: PermuteConstant
+- go: permuteScalars
    asm: VPSHUFD
    addDoc: !string |-
-    // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
-    // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
+    // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+    // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
    in:
    - *128any
    - class: immediate
      immOffset: 0
      name: indices
+  hideMaskMethods: true
    out:
    - *128any
-- go: PermuteConstantGrouped
+
+- go: permuteScalarsGrouped
    asm: VPSHUFD
    addDoc: !string |-
-    // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-    // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
+    // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+    // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
      // Each group is of size 128-bit.
    in:
    - *256Or512any
    - class: immediate
      immOffset: 0
      name: indices
+  hideMaskMethods: true
    out:
    - *256Or512any
  
-- go: PermuteConstantLo
-  asm: VPSHUFHW
+- go: permuteScalarsLo
+  asm: VPSHUFLW
    addDoc: !string |-
-    // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
-    // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
+    // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
+    // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
    in:
-    - *128any
+    - &128lanes8
+      bits: 128
+      go: $t
+      elemBits: 16
      - class: immediate
        immOffset: 0
        name: indices
+  hideMaskMethods: true
    out:
-    - *128any
-- go: PermuteConstantLoGrouped
-  asm: VPSHUFHW
+    - *128lanes8
+
+- go: permuteScalarsLoGrouped
+  asm: VPSHUFLW
    addDoc: !string |-
-    // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-    // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
+    //
+    //   result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
+    //    x_group1[indices[0:2]], ...}
+    //
+    // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
      // Each group is of size 128-bit.
    in:
-  - *256Or512any
+  - &256Or512lanes8
+    bits: "256|512"
+    go: $t
+    elemBits: 16
    - class: immediate
      immOffset: 0
      name: indices
+  hideMaskMethods: true
    out:
-  - *256Or512any
+  - *256Or512lanes8
  
-- go: PermuteConstantHi
+- go: permuteScalarsHi
    asm: VPSHUFHW
    addDoc: !string |-
-    // result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
-    // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
+    // result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+    // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
    in:
-  - *128any
+  - *128lanes8
    - class: immediate
      immOffset: 0
      name: indices
+  hideMaskMethods: true
    out:
-  - *128any
-- go: PermuteConstantHiGrouped
+  - *128lanes8
+
+- go: permuteScalarsHiGrouped
    asm: VPSHUFHW
    addDoc: !string |-
-    // result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
-    // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
+    // result =
+    //
+    //   {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
+    //    x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
+    //
+    // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
      // Each group is of size 128-bit.
    in:
-  - *256Or512any
+  - *256Or512lanes8
    - class: immediate
      immOffset: 0
      name: indices
+  hideMaskMethods: true
    out:
-  - *256Or512any
+  - *256Or512lanes8
  
  - go: InterleaveHi
    asm: VPUNPCKH(QDQ|DQ|WD|WB)
diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go

index 2d7793ef0517d9da5f05e1b4d5b515eec5a1b741..f51e3dc15f2364987a4675ad2cc9c7b58fd40c95 100644 (file)
--- a/src/simd/internal/simd_test/simd_test.go
+++ b/src/simd/internal/simd_test/simd_test.go
@@ -163,7 +163,20 @@ func TestPermute(t *testing.T) {
         }
  }
  
-func TestPermute2(t *testing.T) {
+func TestPermuteOrZero(t *testing.T) {
+       x := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+       indices := []int8{7, 6, 5, 4, 3, 2, 1, 0, -1, 8, -1, 9, -1, 10, -1, 11}
+       want := []uint8{8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 0, 10, 0, 11, 0, 12}
+       got := make([]uint8, len(x))
+       simd.LoadUint8x16Slice(x).PermuteOrZero(simd.LoadInt8x16Slice(indices)).StoreSlice(got)
+       for i := range 8 {
+               if want[i] != got[i] {
+                       t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
+               }
+       }
+}
+
+func TestConcatPermute(t *testing.T) {
         if !simd.X86.AVX512() {
                 t.Skip("Test requires X86.AVX512, not available on this hardware")
                 return
@@ -173,7 +186,7 @@ func TestPermute2(t *testing.T) {
         indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0}
         want := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
         got := make([]int64, 8)
-       simd.LoadInt64x8Slice(x).Permute2(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got)
+       simd.LoadInt64x8Slice(x).ConcatPermute(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got)
         for i := range 8 {
                 if want[i] != got[i] {
                         t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
@@ -1161,3 +1174,75 @@ func TestDotProductQuadruple(t *testing.T) {
                 }
         }
  }
+
+func TestPermuteScalars(t *testing.T) {
+       input := []int32{11, 12, 13, 14}
+       want := []int32{12, 13, 14, 11} // lanes 1, 2, 3, 0 of input, in that order
+       got := make([]int32, len(input))
+       simd.LoadInt32x4Slice(input).PermuteScalars(1, 2, 3, 0).StoreSlice(got)
+       for lane, w := range want {
+               if g := got[lane]; g != w {
+                       t.Errorf("want and got differ at index %d, want=%d, got=%d", lane, w, g)
+               }
+       }
+}
+
+func TestPermuteScalarsGrouped(t *testing.T) {
+       input := []int32{11, 12, 13, 14, 21, 22, 23, 24}
+       want := []int32{12, 13, 14, 11, 22, 23, 24, 21} // each run of four input lanes reordered as 1, 2, 3, 0
+       got := make([]int32, len(input))
+       simd.LoadInt32x8Slice(input).PermuteScalarsGrouped(1, 2, 3, 0).StoreSlice(got)
+       for lane, w := range want {
+               if g := got[lane]; g != w {
+                       t.Errorf("want and got differ at index %d, want=%d, got=%d", lane, w, g)
+               }
+       }
+}
+
+func TestPermuteScalarsHi(t *testing.T) {
+       input := []int16{-1, -2, -3, -4, 11, 12, 13, 14}
+       want := []int16{-1, -2, -3, -4, 12, 13, 14, 11} // first four lanes unchanged, last four reordered as 1, 2, 3, 0
+       got := make([]int16, len(input))
+       simd.LoadInt16x8Slice(input).PermuteScalarsHi(1, 2, 3, 0).StoreSlice(got)
+       for lane, g := range got {
+               if w := want[lane]; w != g {
+                       t.Errorf("want and got differ at index %d, want=%d, got=%d", lane, w, g)
+               }
+       }
+}
+
+func TestPermuteScalarsLo(t *testing.T) {
+       input := []int16{11, 12, 13, 14, 4, 5, 6, 7}
+       want := []int16{12, 13, 14, 11, 4, 5, 6, 7} // first four lanes reordered as 1, 2, 3, 0, last four unchanged
+       got := make([]int16, len(input))
+       simd.LoadInt16x8Slice(input).PermuteScalarsLo(1, 2, 3, 0).StoreSlice(got)
+       for lane, g := range got {
+               if w := want[lane]; w != g {
+                       t.Errorf("want and got differ at index %d, want=%d, got=%d", lane, w, g)
+               }
+       }
+}
+
+func TestPermuteScalarsHiGrouped(t *testing.T) {
+       input := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114}
+       want := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111} // per run of eight: first four kept, last four reordered
+       got := make([]int16, len(input))
+       simd.LoadInt16x16Slice(input).PermuteScalarsHiGrouped(1, 2, 3, 0).StoreSlice(got)
+       for lane, g := range got {
+               if w := want[lane]; w != g {
+                       t.Errorf("want and got differ at index %d, want=%d, got=%d", lane, w, g)
+               }
+       }
+}
+
+func TestPermuteScalarsLoGrouped(t *testing.T) {
+       input := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17}
+       want := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17} // per run of eight: first four reordered, last four kept
+       got := make([]int16, len(input))
+       simd.LoadInt16x16Slice(input).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(got)
+       for lane, g := range got {
+               if w := want[lane]; w != g {
+                       t.Errorf("want and got differ at index %d, want=%d, got=%d", lane, w, g)
+               }
+       }
+}
diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go

index e06d1f652ebf7f177f8e6e960448061aa84cf0b2..e9ddb463be7dde9bb32a143d82147e9b280b6152 100644 (file)
--- a/src/simd/ops_amd64.go
+++ b/src/simd/ops_amd64.go
@@ -1272,6 +1272,248 @@ func (x Uint64x4) Compress(mask Mask64x4) Uint64x4
  // Asm: VPCOMPRESSQ, CPU Feature: AVX512
  func (x Uint64x8) Compress(mask Mask64x8) Uint64x8
  
+/* ConcatPermute */
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x16) ConcatPermute(y Int8x16, indices Uint8x16) Int8x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x16) ConcatPermute(y Uint8x16, indices Uint8x16) Uint8x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x32) ConcatPermute(y Int8x32, indices Uint8x32) Int8x32
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x32) ConcatPermute(y Uint8x32, indices Uint8x32) Uint8x32
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x64) ConcatPermute(y Int8x64, indices Uint8x64) Int8x64
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x64) ConcatPermute(y Uint8x64, indices Uint8x64) Uint8x64
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Int16x8) ConcatPermute(y Int16x8, indices Uint16x8) Int16x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Uint16x8) ConcatPermute(y Uint16x8, indices Uint16x8) Uint16x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Int16x16) ConcatPermute(y Int16x16, indices Uint16x16) Int16x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Uint16x16) ConcatPermute(y Uint16x16, indices Uint16x16) Uint16x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Int16x32) ConcatPermute(y Int16x32, indices Uint16x32) Int16x32
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Uint16x32) ConcatPermute(y Uint16x32, indices Uint16x32) Uint16x32
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512
+func (x Float32x4) ConcatPermute(y Float32x4, indices Uint32x4) Float32x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Int32x4) ConcatPermute(y Int32x4, indices Uint32x4) Int32x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Uint32x4) ConcatPermute(y Uint32x4, indices Uint32x4) Uint32x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512
+func (x Float32x8) ConcatPermute(y Float32x8, indices Uint32x8) Float32x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Int32x8) ConcatPermute(y Int32x8, indices Uint32x8) Int32x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Uint32x8) ConcatPermute(y Uint32x8, indices Uint32x8) Uint32x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512
+func (x Float32x16) ConcatPermute(y Float32x16, indices Uint32x16) Float32x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Int32x16) ConcatPermute(y Int32x16, indices Uint32x16) Int32x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Uint32x16) ConcatPermute(y Uint32x16, indices Uint32x16) Uint32x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512
+func (x Float64x2) ConcatPermute(y Float64x2, indices Uint64x2) Float64x2
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Int64x2) ConcatPermute(y Int64x2, indices Uint64x2) Int64x2
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Uint64x2) ConcatPermute(y Uint64x2, indices Uint64x2) Uint64x2
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512
+func (x Float64x4) ConcatPermute(y Float64x4, indices Uint64x4) Float64x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Int64x4) ConcatPermute(y Int64x4, indices Uint64x4) Int64x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Uint64x4) ConcatPermute(y Uint64x4, indices Uint64x4) Uint64x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512
+func (x Float64x8) ConcatPermute(y Float64x8, indices Uint64x8) Float64x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Int64x8) ConcatPermute(y Int64x8, indices Uint64x8) Int64x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Uint64x8) ConcatPermute(y Uint64x8, indices Uint64x8) Uint64x8
+
  /* ConcatShiftBytesRight */
  
  // ConcatShiftBytesRight concatenates x and y and shift it right by constant bytes.
@@ -4551,675 +4793,227 @@ func (x Uint64x8) Or(y Uint64x8) Uint64x8
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
-// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
+// The low 4 bits (values 0-15) of each element of indices are used.
  //
-// Asm: VPSHUFB, CPU Feature: AVX
-func (x Int8x16) Permute(indices Int8x16) Int8x16
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x16) Permute(indices Uint8x16) Int8x16
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
-// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
+// The low 4 bits (values 0-15) of each element of indices are used.
  //
-// Asm: VPSHUFB, CPU Feature: AVX
+// Asm: VPERMB, CPU Feature: AVX512VBMI
  func (x Uint8x16) Permute(indices Uint8x16) Uint8x16
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 5 bits (values 0-31) of each element of indices are used.
  //
  // Asm: VPERMB, CPU Feature: AVX512VBMI
  func (x Int8x32) Permute(indices Uint8x32) Int8x32
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 5 bits (values 0-31) of each element of indices are used.
  //
  // Asm: VPERMB, CPU Feature: AVX512VBMI
  func (x Uint8x32) Permute(indices Uint8x32) Uint8x32
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 6 bits (values 0-63) of each element of indices are used.
  //
  // Asm: VPERMB, CPU Feature: AVX512VBMI
  func (x Int8x64) Permute(indices Uint8x64) Int8x64
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 6 bits (values 0-63) of each element of indices are used.
  //
  // Asm: VPERMB, CPU Feature: AVX512VBMI
  func (x Uint8x64) Permute(indices Uint8x64) Uint8x64
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 3 bits (values 0-7) of each element of indices are used.
  //
  // Asm: VPERMW, CPU Feature: AVX512
  func (x Int16x8) Permute(indices Uint16x8) Int16x8
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 3 bits (values 0-7) of each element of indices are used.
  //
  // Asm: VPERMW, CPU Feature: AVX512
  func (x Uint16x8) Permute(indices Uint16x8) Uint16x8
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 4 bits (values 0-15) of each element of indices are used.
  //
  // Asm: VPERMW, CPU Feature: AVX512
  func (x Int16x16) Permute(indices Uint16x16) Int16x16
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 4 bits (values 0-15) of each element of indices are used.
  //
  // Asm: VPERMW, CPU Feature: AVX512
  func (x Uint16x16) Permute(indices Uint16x16) Uint16x16
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 5 bits (values 0-31) of each element of indices are used.
  //
  // Asm: VPERMW, CPU Feature: AVX512
  func (x Int16x32) Permute(indices Uint16x32) Int16x32
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 5 bits (values 0-31) of each element of indices are used.
  //
  // Asm: VPERMW, CPU Feature: AVX512
  func (x Uint16x32) Permute(indices Uint16x32) Uint16x32
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 3 bits (values 0-7) of each element of indices are used.
  //
  // Asm: VPERMPS, CPU Feature: AVX2
  func (x Float32x8) Permute(indices Uint32x8) Float32x8
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 3 bits (values 0-7) of each element of indices are used.
  //
  // Asm: VPERMD, CPU Feature: AVX2
  func (x Int32x8) Permute(indices Uint32x8) Int32x8
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 3 bits (values 0-7) of each element of indices are used.
  //
  // Asm: VPERMD, CPU Feature: AVX2
  func (x Uint32x8) Permute(indices Uint32x8) Uint32x8
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 4 bits (values 0-15) of each element of indices are used.
  //
  // Asm: VPERMPS, CPU Feature: AVX512
  func (x Float32x16) Permute(indices Uint32x16) Float32x16
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 4 bits (values 0-15) of each element of indices are used.
  //
  // Asm: VPERMD, CPU Feature: AVX512
  func (x Int32x16) Permute(indices Uint32x16) Int32x16
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 4 bits (values 0-15) of each element of indices are used.
  //
  // Asm: VPERMD, CPU Feature: AVX512
  func (x Uint32x16) Permute(indices Uint32x16) Uint32x16
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 2 bits (values 0-3) of each element of indices are used.
  //
  // Asm: VPERMPD, CPU Feature: AVX512
  func (x Float64x4) Permute(indices Uint64x4) Float64x4
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 2 bits (values 0-3) of each element of indices are used.
  //
  // Asm: VPERMQ, CPU Feature: AVX512
  func (x Int64x4) Permute(indices Uint64x4) Int64x4
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 2 bits (values 0-3) of each element of indices are used.
  //
  // Asm: VPERMQ, CPU Feature: AVX512
  func (x Uint64x4) Permute(indices Uint64x4) Uint64x4
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 3 bits (values 0-7) of each element of indices are used.
  //
  // Asm: VPERMPD, CPU Feature: AVX512
  func (x Float64x8) Permute(indices Uint64x8) Float64x8
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 3 bits (values 0-7) of each element of indices are used.
  //
  // Asm: VPERMQ, CPU Feature: AVX512
  func (x Int64x8) Permute(indices Uint64x8) Int64x8
  
  // Permute performs a full permutation of vector x using indices:
  // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// Only the needed bits to represent x's index are used in indices' elements.
+// The low 3 bits (values 0-7) of each element of indices are used.
  //
  // Asm: VPERMQ, CPU Feature: AVX512
  func (x Uint64x8) Permute(indices Uint64x8) Uint64x8
  
-/* Permute2 */
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Int8x16) Permute2(y Int8x16, indices Uint8x16) Int8x16
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Uint8x16) Permute2(y Uint8x16, indices Uint8x16) Uint8x16
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Int8x32) Permute2(y Int8x32, indices Uint8x32) Int8x32
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Uint8x32) Permute2(y Uint8x32, indices Uint8x32) Uint8x32
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Int8x64) Permute2(y Int8x64, indices Uint8x64) Int8x64
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Uint8x64) Permute2(y Uint8x64, indices Uint8x64) Uint8x64
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Int16x8) Permute2(y Int16x8, indices Uint16x8) Int16x8
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Uint16x8) Permute2(y Uint16x8, indices Uint16x8) Uint16x8
+/* PermuteOrZero */
  
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Int16x16) Permute2(y Int16x16, indices Uint16x16) Int16x16
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Uint16x16) Permute2(y Uint16x16, indices Uint16x16) Uint16x16
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Int16x32) Permute2(y Int16x32, indices Uint16x32) Int16x32
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Uint16x32) Permute2(y Uint16x32, indices Uint16x32) Uint16x32
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PS, CPU Feature: AVX512
-func (x Float32x4) Permute2(y Float32x4, indices Uint32x4) Float32x4
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Int32x4) Permute2(y Int32x4, indices Uint32x4) Int32x4
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Uint32x4) Permute2(y Uint32x4, indices Uint32x4) Uint32x4
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PS, CPU Feature: AVX512
-func (x Float32x8) Permute2(y Float32x8, indices Uint32x8) Float32x8
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Int32x8) Permute2(y Int32x8, indices Uint32x8) Int32x8
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Uint32x8) Permute2(y Uint32x8, indices Uint32x8) Uint32x8
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PS, CPU Feature: AVX512
-func (x Float32x16) Permute2(y Float32x16, indices Uint32x16) Float32x16
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Int32x16) Permute2(y Int32x16, indices Uint32x16) Int32x16
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Uint32x16) Permute2(y Uint32x16, indices Uint32x16) Uint32x16
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PD, CPU Feature: AVX512
-func (x Float64x2) Permute2(y Float64x2, indices Uint64x2) Float64x2
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Int64x2) Permute2(y Int64x2, indices Uint64x2) Int64x2
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Uint64x2) Permute2(y Uint64x2, indices Uint64x2) Uint64x2
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PD, CPU Feature: AVX512
-func (x Float64x4) Permute2(y Float64x4, indices Uint64x4) Float64x4
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Int64x4) Permute2(y Int64x4, indices Uint64x4) Int64x4
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Uint64x4) Permute2(y Uint64x4, indices Uint64x4) Uint64x4
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PD, CPU Feature: AVX512
-func (x Float64x8) Permute2(y Float64x8, indices Uint64x8) Float64x8
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Int64x8) Permute2(y Int64x8, indices Uint64x8) Int64x8
-
-// Permute2 performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is x appending y.
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Uint64x8) Permute2(y Uint64x8, indices Uint64x8) Uint64x8
-
-/* PermuteConstant */
-
-// PermuteConstant performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX
-func (x Int32x4) PermuteConstant(indices uint8) Int32x4
-
-// PermuteConstant performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX
-func (x Uint32x4) PermuteConstant(indices uint8) Uint32x4
-
-/* PermuteConstantGrouped */
-
-// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX2
-func (x Int32x8) PermuteConstantGrouped(indices uint8) Int32x8
-
-// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX512
-func (x Int32x16) PermuteConstantGrouped(indices uint8) Int32x16
-
-// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX2
-func (x Uint32x8) PermuteConstantGrouped(indices uint8) Uint32x8
-
-// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX512
-func (x Uint32x16) PermuteConstantGrouped(indices uint8) Uint32x16
-
-/* PermuteConstantHi */
-
-// PermuteConstantHi performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Int16x8) PermuteConstantHi(indices uint8) Int16x8
-
-// PermuteConstantHi performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX
-func (x Int32x4) PermuteConstantHi(indices uint8) Int32x4
-
-// PermuteConstantHi performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Uint16x8) PermuteConstantHi(indices uint8) Uint16x8
-
-// PermuteConstantHi performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX
-func (x Uint32x4) PermuteConstantHi(indices uint8) Uint32x4
-
-/* PermuteConstantHiGrouped */
-
-// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX2
-func (x Int16x16) PermuteConstantHiGrouped(indices uint8) Int16x16
-
-// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Int16x32) PermuteConstantHiGrouped(indices uint8) Int16x32
-
-// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX2
-func (x Uint16x16) PermuteConstantHiGrouped(indices uint8) Uint16x16
-
-// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Uint16x32) PermuteConstantHiGrouped(indices uint8) Uint16x32
-
-/* PermuteConstantLo */
-
-// PermuteConstantLo performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Int16x8) PermuteConstantLo(indices uint8) Int16x8
-
-// PermuteConstantLo performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX
-func (x Int32x4) PermuteConstantLo(indices uint8) Int32x4
-
-// PermuteConstantLo performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Uint16x8) PermuteConstantLo(indices uint8) Uint16x8
-
-// PermuteConstantLo performs a permutation of vector x using constant indices:
-// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX
-func (x Uint32x4) PermuteConstantLo(indices uint8) Uint32x4
-
-/* PermuteConstantLoGrouped */
-
-// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX2
-func (x Int16x16) PermuteConstantLoGrouped(indices uint8) Int16x16
-
-// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Int16x32) PermuteConstantLoGrouped(indices uint8) Int16x32
-
-// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+// PermuteOrZero performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The lower four bits of each byte-sized index in indices select an element from x,
+// unless the index's sign bit is set, in which case zero is used instead.
  //
-// Asm: VPSHUFHW, CPU Feature: AVX2
-func (x Uint16x16) PermuteConstantLoGrouped(indices uint8) Uint16x16
+// Asm: VPSHUFB, CPU Feature: AVX
+func (x Int8x16) PermuteOrZero(indices Int8x16) Int8x16
  
-// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
-// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+// PermuteOrZero performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The lower four bits of each byte-sized index in indices select an element from x,
+// unless the index's sign bit is set, in which case zero is used instead.
  //
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Uint16x32) PermuteConstantLoGrouped(indices uint8) Uint16x32
+// Asm: VPSHUFB, CPU Feature: AVX
+func (x Uint8x16) PermuteOrZero(indices Int8x16) Uint8x16
  
-/* PermuteGrouped */
+/* PermuteOrZeroGrouped */
  
-// PermuteGrouped performs a grouped permutation of vector x using indices:
-// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
-// Only the needed bits to represent the index of a group of x are used in indices' elements.
-// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
+// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices:
+// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
+// unless the index's sign bit is set, in which case zero is used instead.
  // Each group is of size 128-bit.
  //
  // Asm: VPSHUFB, CPU Feature: AVX2
-func (x Int8x32) PermuteGrouped(indices Int8x32) Int8x32
+func (x Int8x32) PermuteOrZeroGrouped(indices Int8x32) Int8x32
  
-// PermuteGrouped performs a grouped permutation of vector x using indices:
-// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
-// Only the needed bits to represent the index of a group of x are used in indices' elements.
-// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
+// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices:
+// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
+// unless the index's sign bit is set, in which case zero is used instead.
  // Each group is of size 128-bit.
  //
  // Asm: VPSHUFB, CPU Feature: AVX512
-func (x Int8x64) PermuteGrouped(indices Int8x64) Int8x64
+func (x Int8x64) PermuteOrZeroGrouped(indices Int8x64) Int8x64
  
-// PermuteGrouped performs a grouped permutation of vector x using indices:
-// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
-// Only the needed bits to represent the index of a group of x are used in indices' elements.
-// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
+// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices:
+// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
+// unless the index's sign bit is set, in which case zero is used instead.
  // Each group is of size 128-bit.
  //
  // Asm: VPSHUFB, CPU Feature: AVX2
-func (x Uint8x32) PermuteGrouped(indices Uint8x32) Uint8x32
+func (x Uint8x32) PermuteOrZeroGrouped(indices Int8x32) Uint8x32
  
-// PermuteGrouped performs a grouped permutation of vector x using indices:
-// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
-// Only the needed bits to represent the index of a group of x are used in indices' elements.
-// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
+// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices:
+// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
+// unless the index's sign bit is set, in which case zero is used instead.
  // Each group is of size 128-bit.
  //
  // Asm: VPSHUFB, CPU Feature: AVX512
-func (x Uint8x64) PermuteGrouped(indices Uint8x64) Uint8x64
+func (x Uint8x64) PermuteOrZeroGrouped(indices Int8x64) Uint8x64
  
  /* Reciprocal */
  
@@ -5807,8 +5601,10 @@ func (x Float64x8) Scale(y Float64x8) Float64x8
  
  /* Select128FromPair */
  
-// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
-// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
  //
  // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
  // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
@@ -5816,8 +5612,10 @@ func (x Float64x8) Scale(y Float64x8) Float64x8
  // Asm: VPERM2F128, CPU Feature: AVX
  func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8
  
-// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
-// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
  //
  // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
  // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
@@ -5825,8 +5623,10 @@ func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8
  // Asm: VPERM2F128, CPU Feature: AVX
  func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4
  
-// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
-// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
  //
  // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
  // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
@@ -5834,8 +5634,10 @@ func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4
  // Asm: VPERM2I128, CPU Feature: AVX2
  func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8
  
-// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
-// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
  //
  // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
  // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
@@ -5843,8 +5645,10 @@ func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8
  // Asm: VPERM2I128, CPU Feature: AVX2
  func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4
  
-// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
-// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
  //
  // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
  // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
@@ -5852,8 +5656,10 @@ func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4
  // Asm: VPERM2I128, CPU Feature: AVX2
  func (x Uint32x8) Select128FromPair(lo, hi uint8, y Uint32x8) Uint32x8
  
-// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves
-// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}.
  //
  // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
  // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic.
diff --git a/src/simd/ops_internal_amd64.go b/src/simd/ops_internal_amd64.go

index 8be40995f0cc999db7a910786f09d8e191e35134..63ee6416a66cf7070a26ec8fabd8a7bb8a0888b9 100644 (file)
--- a/src/simd/ops_internal_amd64.go
+++ b/src/simd/ops_internal_amd64.go
@@ -338,6 +338,220 @@ func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x
  // Asm: VSHUFPD, CPU Feature: AVX512
  func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8
  
+/* permuteScalars */
+
+// permuteScalars performs a permutation of vector x using constant indices:
+// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX
+func (x Int32x4) permuteScalars(indices uint8) Int32x4
+
+// permuteScalars performs a permutation of vector x using constant indices:
+// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX
+func (x Uint32x4) permuteScalars(indices uint8) Uint32x4
+
+/* permuteScalarsGrouped */
+
+// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
+// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX2
+func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8
+
+// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
+// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX512
+func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16
+
+// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
+// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX2
+func (x Uint32x8) permuteScalarsGrouped(indices uint8) Uint32x8
+
+// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
+// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX512
+func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16
+
+/* permuteScalarsHi */
+
+// permuteScalarsHi performs a permutation of vector x using constant indices:
+// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x8) permuteScalarsHi(indices uint8) Int16x8
+
+// permuteScalarsHi performs a permutation of vector x using constant indices:
+// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x8) permuteScalarsHi(indices uint8) Uint16x8
+
+/* permuteScalarsHiGrouped */
+
+// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
+// result =
+//
+//     {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
+//      x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Int16x16) permuteScalarsHiGrouped(indices uint8) Int16x16
+
+// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
+// result =
+//
+//     {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
+//      x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x32) permuteScalarsHiGrouped(indices uint8) Int16x32
+
+// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
+// result =
+//
+//     {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
+//      x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Uint16x16) permuteScalarsHiGrouped(indices uint8) Uint16x16
+
+// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
+// result =
+//
+//     {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
+//      x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32
+
+/* permuteScalarsLo */
+
+// permuteScalarsLo performs a permutation of vector x using constant indices:
+// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Int16x8) permuteScalarsLo(indices uint8) Int16x8
+
+// permuteScalarsLo performs a permutation of vector x using constant indices:
+// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Uint16x8) permuteScalarsLo(indices uint8) Uint16x8
+
+/* permuteScalarsLoGrouped */
+
+// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
+//
+//     result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
+//      x_group1[indices[0:2]], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX2
+func (x Int16x16) permuteScalarsLoGrouped(indices uint8) Int16x16
+
+// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
+//
+//     result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
+//      x_group1[indices[0:2]], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Int16x32) permuteScalarsLoGrouped(indices uint8) Int16x32
+
+// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
+//
+//     result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
+//      x_group1[indices[0:2]], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX2
+func (x Uint16x16) permuteScalarsLoGrouped(indices uint8) Uint16x16
+
+// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
+//
+//     result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
+//      x_group1[indices[0:2]], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Uint16x32) permuteScalarsLoGrouped(indices uint8) Uint16x32
+
  /* tern */
  
  // tern performs a logical operation on three vectors based on the 8-bit truth table.
diff --git a/src/simd/shuffles_amd64.go b/src/simd/shuffles_amd64.go

index e0d9db9266e1e4992d9d31af190c0cb067b0c9f9..b7472f70200c104a0c0895ee8d7f13593e5cee75 100644 (file)
--- a/src/simd/shuffles_amd64.go
+++ b/src/simd/shuffles_amd64.go
@@ -989,3 +989,280 @@ func (x Int64x8) SelectFromPairGrouped(a, b uint8, y Int64x8) Int64x8 {
         }
         panic("missing case, switch should be exhaustive")
  }
+
+/* PermuteScalars */
+
+// PermuteScalars performs a permutation of vector x's elements using the supplied indices:
+//
+//     result = {x[a], x[b], x[c], x[d]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table may be generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX
+func (x Int32x4) PermuteScalars(a, b, c, d uint8) Int32x4 {
+       return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalars performs a permutation of vector x's elements using the supplied indices:
+//
+//     result = {x[a], x[b], x[c], x[d]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table may be generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX
+func (x Uint32x4) PermuteScalars(a, b, c, d uint8) Uint32x4 {
+       return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+/* PermuteScalarsGrouped */
+
+// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+//     result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table may be generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX2
+func (x Int32x8) PermuteScalarsGrouped(a, b, c, d uint8) Int32x8 {
+       return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+//      result =
+//              {  x[a], x[b], x[c], x[d],         x[a+4], x[b+4], x[c+4], x[d+4],
+//                     x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table may be generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX512
+func (x Int32x16) PermuteScalarsGrouped(a, b, c, d uint8) Int32x16 {
+       return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+//     result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX2
+func (x Uint32x8) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x8 {
+       return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+//      result =
+//              {  x[a], x[b], x[c], x[d],         x[a+4], x[b+4], x[c+4], x[d+4],
+//                     x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX512
+func (x Uint32x16) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x16 {
+       return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+/* PermuteScalarsHi */
+
+// PermuteScalarsHi performs a permutation of vector x using the supplied indices:
+//
+//     result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x8) PermuteScalarsHi(a, b, c, d uint8) Int16x8 {
+       return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsHi performs a permutation of vector x using the supplied indices:
+//
+//     result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x8) PermuteScalarsHi(a, b, c, d uint8) Uint16x8 {
+       return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+/* PermuteScalarsHiGrouped */
+
+// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+//      result =
+//               {x[0], x[1], x[2], x[3],   x[a+4], x[b+4], x[c+4], x[d+4],
+//                     x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Int16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x16 {
+       return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+//      result =
+//               {x[0], x[1], x[2], x[3],     x[a+4], x[b+4], x[c+4], x[d+4],
+//                     x[8], x[9], x[10], x[11],   x[a+12], x[b+12], x[c+12], x[d+12],
+//                     x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
+//                     x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x32 {
+       return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+//      result =
+//       {x[0], x[1], x[2], x[3],   x[a+4], x[b+4], x[c+4], x[d+4],
+//             x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
+//
+// Each group is of size 128-bit.
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Uint16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x16 {
+       return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+//      result =
+//              {  x[0], x[1], x[2], x[3],     x[a+4], x[b+4], x[c+4], x[d+4],
+//                     x[8], x[9], x[10], x[11],   x[a+12], x[b+12], x[c+12], x[d+12],
+//                     x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
+//                     x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x32 {
+       return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+/* PermuteScalarsLo */
+
+// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
+//
+//     result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Int16x8) PermuteScalarsLo(a, b, c, d uint8) Int16x8 {
+       return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
+//
+//     result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Uint16x8) PermuteScalarsLo(a, b, c, d uint8) Uint16x8 {
+       return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+/* PermuteScalarsLoGrouped */
+
+// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+//      result =
+//      {x[a], x[b], x[c], x[d],         x[4], x[5], x[6], x[7],
+//              x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX2
+func (x Int16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x16 {
+       return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+//      result =
+//      {x[a], x[b], x[c], x[d],    x[4], x[5], x[6], x[7],
+//             x[a+8], x[b+8], x[c+8], x[d+8],     x[12], x[13], x[14], x[15],
+//             x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
+//             x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Int16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x32 {
+       return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+//      result = {x[a], x[b], x[c], x[d],         x[4], x[5], x[6], x[7],
+//             x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX2
+func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 {
+       return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+//      result =
+//      {x[a], x[b], x[c], x[d],    x[4], x[5], x[6], x[7],
+//             x[a+8], x[b+8], x[c+8], x[d+8],     x[12], x[13], x[14], x[15],
+//             x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
+//             x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
+//
+// Each group is of size 128-bit.
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 {
+       return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
author	David Chase <drchase@google.com>
	Mon, 17 Nov 2025 20:31:36 +0000 (15:31 -0500)
committer	David Chase <drchase@google.com>
	Fri, 21 Nov 2025 01:47:32 +0000 (17:47 -0800)
src/cmd/compile/internal/amd64/simdssa.go		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdgenericOps.go		patch \| blob \| history
src/cmd/compile/internal/ssa/opGen.go		patch \| blob \| history
src/cmd/compile/internal/ssa/rewriteAMD64.go		patch \| blob \| history
src/cmd/compile/internal/ssagen/simdintrinsics.go		patch \| blob \| history
src/simd/_gen/simdgen/gen_simdGenericOps.go		patch \| blob \| history
src/simd/_gen/simdgen/gen_simdIntrinsics.go		patch \| blob \| history
src/simd/_gen/simdgen/gen_simdTypes.go		patch \| blob \| history
src/simd/_gen/simdgen/gen_simdrules.go		patch \| blob \| history
src/simd/_gen/simdgen/godefs.go		patch \| blob \| history
src/simd/_gen/simdgen/ops/Moves/categories.yaml		patch \| blob \| history
src/simd/_gen/simdgen/ops/Moves/go.yaml		patch \| blob \| history
src/simd/internal/simd_test/simd_test.go		patch \| blob \| history
src/simd/ops_amd64.go		patch \| blob \| history
src/simd/ops_internal_amd64.go		patch \| blob \| history
src/simd/shuffles_amd64.go		patch \| blob \| history