From 4d26d66a49c51b5a7c610c4815322809b31962d9 Mon Sep 17 00:00:00 2001 From: David Chase Date: Mon, 17 Nov 2025 15:31:36 -0500 Subject: [PATCH] [dev.simd] simd: fix signatures for PermuteConstant* methods This moves the packed-immediate methods to package-private, and adds exported versions with four parameters. Rename PermuteConstant to PermuteScalars Rename VPSHUFB Permute to PermuteOrZero Rename Permute2 to ConcatPermute Comments were repaired/enhanced. Modified the generator to support an additional tag "hideMaskMethods : true" to suppress method, intrinsic, generic, and generic translation generation for said mask-modified versions of such methods (this is already true for exported methods). Change-Id: I91e208c1fff1f28ebce4edb4e73d26003715018c Reviewed-on: https://go-review.googlesource.com/c/go/+/721342 LUCI-TryBot-Result: Go LUCI Reviewed-by: Junyang Shao Reviewed-by: Cherry Mui --- src/cmd/compile/internal/amd64/simdssa.go | 271 ++-- .../compile/internal/ssa/_gen/simdAMD64.rules | 231 ++-- .../compile/internal/ssa/_gen/simdAMD64ops.go | 11 + .../internal/ssa/_gen/simdgenericOps.go | 110 +- src/cmd/compile/internal/ssa/opGen.go | 862 ++++++++----- src/cmd/compile/internal/ssa/rewriteAMD64.go | 1128 +++++++++-------- .../compile/internal/ssagen/simdintrinsics.go | 114 +- src/simd/_gen/simdgen/gen_simdGenericOps.go | 3 + src/simd/_gen/simdgen/gen_simdIntrinsics.go | 3 + src/simd/_gen/simdgen/gen_simdTypes.go | 3 + src/simd/_gen/simdgen/gen_simdrules.go | 3 +- src/simd/_gen/simdgen/godefs.go | 33 +- .../_gen/simdgen/ops/Moves/categories.yaml | 32 +- src/simd/_gen/simdgen/ops/Moves/go.yaml | 156 ++- src/simd/internal/simd_test/simd_test.go | 89 +- src/simd/ops_amd64.go | 848 +++++-------- src/simd/ops_internal_amd64.go | 214 ++++ src/simd/shuffles_amd64.go | 277 ++++ 18 files changed, 2591 insertions(+), 1797 deletions(-) diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index 3f8ce17972..b70a72b2f8 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -396,7 +396,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPOR256, ssa.OpAMD64VPORD512, ssa.OpAMD64VPORQ512, - ssa.OpAMD64VPSHUFB128, + ssa.OpAMD64VPERMB128, ssa.OpAMD64VPERMB256, ssa.OpAMD64VPERMB512, ssa.OpAMD64VPERMW128, @@ -410,6 +410,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPERMQ256, ssa.OpAMD64VPERMPD512, ssa.OpAMD64VPERMQ512, + ssa.OpAMD64VPSHUFB128, ssa.OpAMD64VPSHUFB256, ssa.OpAMD64VPSHUFB512, ssa.OpAMD64VPROLVD128, @@ -672,9 +673,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPORQMasked128, ssa.OpAMD64VPORQMasked256, ssa.OpAMD64VPORQMasked512, - ssa.OpAMD64VPSHUFBMasked256, - ssa.OpAMD64VPSHUFBMasked512, - ssa.OpAMD64VPSHUFBMasked128, + ssa.OpAMD64VPERMBMasked128, ssa.OpAMD64VPERMBMasked256, ssa.OpAMD64VPERMBMasked512, ssa.OpAMD64VPERMWMasked128, @@ -688,6 +687,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPERMQMasked256, ssa.OpAMD64VPERMPDMasked512, ssa.OpAMD64VPERMQMasked512, + ssa.OpAMD64VPSHUFBMasked256, + ssa.OpAMD64VPSHUFBMasked512, + ssa.OpAMD64VPSHUFBMasked128, ssa.OpAMD64VPROLVDMasked128, ssa.OpAMD64VPROLVDMasked256, ssa.OpAMD64VPROLVDMasked512, @@ -1011,12 +1013,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VEXTRACTF64X4256, ssa.OpAMD64VEXTRACTI128128, ssa.OpAMD64VEXTRACTI64X4256, - ssa.OpAMD64VPSHUFD128, - ssa.OpAMD64VPSHUFD256, - ssa.OpAMD64VPSHUFD512, - 
ssa.OpAMD64VPSHUFHW128, - ssa.OpAMD64VPSHUFHW256, - ssa.OpAMD64VPSHUFHW512, ssa.OpAMD64VPROLD128, ssa.OpAMD64VPROLD256, ssa.OpAMD64VPROLD512, @@ -1029,6 +1025,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPRORQ128, ssa.OpAMD64VPRORQ256, ssa.OpAMD64VPRORQ512, + ssa.OpAMD64VPSHUFD128, + ssa.OpAMD64VPSHUFD256, + ssa.OpAMD64VPSHUFD512, + ssa.OpAMD64VPSHUFHW128, + ssa.OpAMD64VPSHUFHW256, + ssa.OpAMD64VPSHUFHW512, + ssa.OpAMD64VPSHUFLW128, + ssa.OpAMD64VPSHUFLW256, + ssa.OpAMD64VPSHUFLW512, ssa.OpAMD64VPSLLW128const, ssa.OpAMD64VPSLLW256const, ssa.OpAMD64VPSLLW512const, @@ -1070,12 +1075,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VREDUCEPDMasked128, ssa.OpAMD64VREDUCEPDMasked256, ssa.OpAMD64VREDUCEPDMasked512, - ssa.OpAMD64VPSHUFDMasked256, - ssa.OpAMD64VPSHUFDMasked512, - ssa.OpAMD64VPSHUFHWMasked256, - ssa.OpAMD64VPSHUFHWMasked512, - ssa.OpAMD64VPSHUFHWMasked128, - ssa.OpAMD64VPSHUFDMasked128, ssa.OpAMD64VPROLDMasked128, ssa.OpAMD64VPROLDMasked256, ssa.OpAMD64VPROLDMasked512, @@ -1088,6 +1087,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPRORQMasked128, ssa.OpAMD64VPRORQMasked256, ssa.OpAMD64VPRORQMasked512, + ssa.OpAMD64VPSHUFDMasked256, + ssa.OpAMD64VPSHUFDMasked512, + ssa.OpAMD64VPSHUFHWMasked256, + ssa.OpAMD64VPSHUFHWMasked512, + ssa.OpAMD64VPSHUFHWMasked128, + ssa.OpAMD64VPSHUFLWMasked256, + ssa.OpAMD64VPSHUFLWMasked512, + ssa.OpAMD64VPSHUFLWMasked128, + ssa.OpAMD64VPSHUFDMasked128, ssa.OpAMD64VPSLLWMasked128const, ssa.OpAMD64VPSLLWMasked256const, ssa.OpAMD64VPSLLWMasked512const, @@ -1209,6 +1217,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { case ssa.OpAMD64VPDPWSSD128, ssa.OpAMD64VPDPWSSD256, ssa.OpAMD64VPDPWSSD512, + ssa.OpAMD64VPERMI2B128, + ssa.OpAMD64VPERMI2B256, + ssa.OpAMD64VPERMI2B512, + ssa.OpAMD64VPERMI2W128, + ssa.OpAMD64VPERMI2W256, + ssa.OpAMD64VPERMI2W512, + ssa.OpAMD64VPERMI2PS128, + ssa.OpAMD64VPERMI2D128, + ssa.OpAMD64VPERMI2PS256, + ssa.OpAMD64VPERMI2D256, + ssa.OpAMD64VPERMI2PS512, + ssa.OpAMD64VPERMI2D512, + ssa.OpAMD64VPERMI2PD128, + ssa.OpAMD64VPERMI2Q128, + ssa.OpAMD64VPERMI2PD256, + ssa.OpAMD64VPERMI2Q256, + ssa.OpAMD64VPERMI2PD512, + ssa.OpAMD64VPERMI2Q512, ssa.OpAMD64VPDPBUSD128, ssa.OpAMD64VPDPBUSD256, ssa.OpAMD64VPDPBUSD512, @@ -1233,24 +1259,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VFMSUBADD213PD128, ssa.OpAMD64VFMSUBADD213PD256, ssa.OpAMD64VFMSUBADD213PD512, - ssa.OpAMD64VPERMI2B128, - ssa.OpAMD64VPERMI2B256, - ssa.OpAMD64VPERMI2B512, - ssa.OpAMD64VPERMI2W128, - ssa.OpAMD64VPERMI2W256, - ssa.OpAMD64VPERMI2W512, - ssa.OpAMD64VPERMI2PS128, - ssa.OpAMD64VPERMI2D128, - ssa.OpAMD64VPERMI2PS256, - ssa.OpAMD64VPERMI2D256, - ssa.OpAMD64VPERMI2PS512, - ssa.OpAMD64VPERMI2D512, - ssa.OpAMD64VPERMI2PD128, - ssa.OpAMD64VPERMI2Q128, - ssa.OpAMD64VPERMI2PD256, - ssa.OpAMD64VPERMI2Q256, - ssa.OpAMD64VPERMI2PD512, - ssa.OpAMD64VPERMI2Q512, ssa.OpAMD64VPSHLDVW128, ssa.OpAMD64VPSHLDVW256, ssa.OpAMD64VPSHLDVW512, @@ -1316,6 +1324,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPAVGWMasked128Merging, ssa.OpAMD64VPAVGWMasked256Merging, ssa.OpAMD64VPAVGWMasked512Merging, + ssa.OpAMD64VPERMI2BMasked128, + ssa.OpAMD64VPERMI2BMasked256, + ssa.OpAMD64VPERMI2BMasked512, + ssa.OpAMD64VPERMI2WMasked128, + ssa.OpAMD64VPERMI2WMasked256, + ssa.OpAMD64VPERMI2WMasked512, + ssa.OpAMD64VPERMI2PSMasked128, + ssa.OpAMD64VPERMI2DMasked128, + ssa.OpAMD64VPERMI2PSMasked256, + ssa.OpAMD64VPERMI2DMasked256, + 
ssa.OpAMD64VPERMI2PSMasked512, + ssa.OpAMD64VPERMI2DMasked512, + ssa.OpAMD64VPERMI2PDMasked128, + ssa.OpAMD64VPERMI2QMasked128, + ssa.OpAMD64VPERMI2PDMasked256, + ssa.OpAMD64VPERMI2QMasked256, + ssa.OpAMD64VPERMI2PDMasked512, + ssa.OpAMD64VPERMI2QMasked512, ssa.OpAMD64VPALIGNRMasked256Merging, ssa.OpAMD64VPALIGNRMasked512Merging, ssa.OpAMD64VPALIGNRMasked128Merging, @@ -1451,24 +1477,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPORQMasked128Merging, ssa.OpAMD64VPORQMasked256Merging, ssa.OpAMD64VPORQMasked512Merging, - ssa.OpAMD64VPERMI2BMasked128, - ssa.OpAMD64VPERMI2BMasked256, - ssa.OpAMD64VPERMI2BMasked512, - ssa.OpAMD64VPERMI2WMasked128, - ssa.OpAMD64VPERMI2WMasked256, - ssa.OpAMD64VPERMI2WMasked512, - ssa.OpAMD64VPERMI2PSMasked128, - ssa.OpAMD64VPERMI2DMasked128, - ssa.OpAMD64VPERMI2PSMasked256, - ssa.OpAMD64VPERMI2DMasked256, - ssa.OpAMD64VPERMI2PSMasked512, - ssa.OpAMD64VPERMI2DMasked512, - ssa.OpAMD64VPERMI2PDMasked128, - ssa.OpAMD64VPERMI2QMasked128, - ssa.OpAMD64VPERMI2PDMasked256, - ssa.OpAMD64VPERMI2QMasked256, - ssa.OpAMD64VPERMI2PDMasked512, - ssa.OpAMD64VPERMI2QMasked512, ssa.OpAMD64VPSHUFBMasked256Merging, ssa.OpAMD64VPSHUFBMasked512Merging, ssa.OpAMD64VPSHUFBMasked128Merging, @@ -1819,6 +1827,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { p = simdV21load(s, v) case ssa.OpAMD64VPDPWSSD512load, + ssa.OpAMD64VPERMI2PS128load, + ssa.OpAMD64VPERMI2D128load, + ssa.OpAMD64VPERMI2PS256load, + ssa.OpAMD64VPERMI2D256load, + ssa.OpAMD64VPERMI2PS512load, + ssa.OpAMD64VPERMI2D512load, + ssa.OpAMD64VPERMI2PD128load, + ssa.OpAMD64VPERMI2Q128load, + ssa.OpAMD64VPERMI2PD256load, + ssa.OpAMD64VPERMI2Q256load, + ssa.OpAMD64VPERMI2PD512load, + ssa.OpAMD64VPERMI2Q512load, ssa.OpAMD64VPDPBUSD512load, ssa.OpAMD64VPDPBUSDS512load, ssa.OpAMD64VFMADD213PS128load, @@ -1839,18 +1859,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VFMSUBADD213PD128load, ssa.OpAMD64VFMSUBADD213PD256load, ssa.OpAMD64VFMSUBADD213PD512load, - ssa.OpAMD64VPERMI2PS128load, - ssa.OpAMD64VPERMI2D128load, - ssa.OpAMD64VPERMI2PS256load, - ssa.OpAMD64VPERMI2D256load, - ssa.OpAMD64VPERMI2PS512load, - ssa.OpAMD64VPERMI2D512load, - ssa.OpAMD64VPERMI2PD128load, - ssa.OpAMD64VPERMI2Q128load, - ssa.OpAMD64VPERMI2PD256load, - ssa.OpAMD64VPERMI2Q256load, - ssa.OpAMD64VPERMI2PD512load, - ssa.OpAMD64VPERMI2Q512load, ssa.OpAMD64VPSHLDVD128load, ssa.OpAMD64VPSHLDVD256load, ssa.OpAMD64VPSHLDVD512load, @@ -1868,6 +1876,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { case ssa.OpAMD64VPDPWSSDMasked128load, ssa.OpAMD64VPDPWSSDMasked256load, ssa.OpAMD64VPDPWSSDMasked512load, + ssa.OpAMD64VPERMI2PSMasked128load, + ssa.OpAMD64VPERMI2DMasked128load, + ssa.OpAMD64VPERMI2PSMasked256load, + ssa.OpAMD64VPERMI2DMasked256load, + ssa.OpAMD64VPERMI2PSMasked512load, + ssa.OpAMD64VPERMI2DMasked512load, + ssa.OpAMD64VPERMI2PDMasked128load, + ssa.OpAMD64VPERMI2QMasked128load, + ssa.OpAMD64VPERMI2PDMasked256load, + ssa.OpAMD64VPERMI2QMasked256load, + ssa.OpAMD64VPERMI2PDMasked512load, + ssa.OpAMD64VPERMI2QMasked512load, ssa.OpAMD64VPDPBUSDMasked128load, ssa.OpAMD64VPDPBUSDMasked256load, ssa.OpAMD64VPDPBUSDMasked512load, @@ -1892,18 +1912,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VFMSUBADD213PDMasked128load, ssa.OpAMD64VFMSUBADD213PDMasked256load, ssa.OpAMD64VFMSUBADD213PDMasked512load, - ssa.OpAMD64VPERMI2PSMasked128load, - ssa.OpAMD64VPERMI2DMasked128load, - ssa.OpAMD64VPERMI2PSMasked256load, - ssa.OpAMD64VPERMI2DMasked256load, - 
ssa.OpAMD64VPERMI2PSMasked512load, - ssa.OpAMD64VPERMI2DMasked512load, - ssa.OpAMD64VPERMI2PDMasked128load, - ssa.OpAMD64VPERMI2QMasked128load, - ssa.OpAMD64VPERMI2PDMasked256load, - ssa.OpAMD64VPERMI2QMasked256load, - ssa.OpAMD64VPERMI2PDMasked512load, - ssa.OpAMD64VPERMI2QMasked512load, ssa.OpAMD64VPSHLDVDMasked128load, ssa.OpAMD64VPSHLDVDMasked256load, ssa.OpAMD64VPSHLDVDMasked512load, @@ -2124,7 +2132,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VREDUCEPD128load, ssa.OpAMD64VREDUCEPD256load, ssa.OpAMD64VREDUCEPD512load, - ssa.OpAMD64VPSHUFD512load, ssa.OpAMD64VPROLD128load, ssa.OpAMD64VPROLD256load, ssa.OpAMD64VPROLD512load, @@ -2137,6 +2144,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPRORQ128load, ssa.OpAMD64VPRORQ256load, ssa.OpAMD64VPRORQ512load, + ssa.OpAMD64VPSHUFD512load, ssa.OpAMD64VPSLLD512constload, ssa.OpAMD64VPSLLQ512constload, ssa.OpAMD64VPSRLD512constload, @@ -2159,9 +2167,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VREDUCEPDMasked128load, ssa.OpAMD64VREDUCEPDMasked256load, ssa.OpAMD64VREDUCEPDMasked512load, - ssa.OpAMD64VPSHUFDMasked256load, - ssa.OpAMD64VPSHUFDMasked512load, - ssa.OpAMD64VPSHUFDMasked128load, ssa.OpAMD64VPROLDMasked128load, ssa.OpAMD64VPROLDMasked256load, ssa.OpAMD64VPROLDMasked512load, @@ -2174,6 +2179,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPRORQMasked128load, ssa.OpAMD64VPRORQMasked256load, ssa.OpAMD64VPRORQMasked512load, + ssa.OpAMD64VPSHUFDMasked256load, + ssa.OpAMD64VPSHUFDMasked512load, + ssa.OpAMD64VPSHUFDMasked128load, ssa.OpAMD64VPSLLDMasked128constload, ssa.OpAMD64VPSLLDMasked256constload, ssa.OpAMD64VPSLLDMasked512constload, @@ -2447,12 +2455,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPOPCNTQMasked128Merging, ssa.OpAMD64VPOPCNTQMasked256Merging, ssa.OpAMD64VPOPCNTQMasked512Merging, - ssa.OpAMD64VPSHUFDMasked256Merging, - ssa.OpAMD64VPSHUFDMasked512Merging, - ssa.OpAMD64VPSHUFHWMasked256Merging, - ssa.OpAMD64VPSHUFHWMasked512Merging, - ssa.OpAMD64VPSHUFHWMasked128Merging, - ssa.OpAMD64VPSHUFDMasked128Merging, ssa.OpAMD64VRCP14PSMasked128Merging, ssa.OpAMD64VRCP14PSMasked256Merging, ssa.OpAMD64VRCP14PSMasked512Merging, @@ -2483,6 +2485,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VSQRTPDMasked128Merging, ssa.OpAMD64VSQRTPDMasked256Merging, ssa.OpAMD64VSQRTPDMasked512Merging, + ssa.OpAMD64VPSHUFDMasked256Merging, + ssa.OpAMD64VPSHUFDMasked512Merging, + ssa.OpAMD64VPSHUFHWMasked256Merging, + ssa.OpAMD64VPSHUFHWMasked512Merging, + ssa.OpAMD64VPSHUFHWMasked128Merging, + ssa.OpAMD64VPSHUFLWMasked256Merging, + ssa.OpAMD64VPSHUFLWMasked512Merging, + ssa.OpAMD64VPSHUFLWMasked128Merging, + ssa.OpAMD64VPSHUFDMasked128Merging, ssa.OpAMD64VPSLLWMasked128constMerging, ssa.OpAMD64VPSLLWMasked256constMerging, ssa.OpAMD64VPSLLWMasked512constMerging, @@ -2674,6 +2685,36 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPCOMPRESSQMasked128, ssa.OpAMD64VPCOMPRESSQMasked256, ssa.OpAMD64VPCOMPRESSQMasked512, + ssa.OpAMD64VPERMI2BMasked128, + ssa.OpAMD64VPERMI2BMasked256, + ssa.OpAMD64VPERMI2BMasked512, + ssa.OpAMD64VPERMI2WMasked128, + ssa.OpAMD64VPERMI2WMasked256, + ssa.OpAMD64VPERMI2WMasked512, + ssa.OpAMD64VPERMI2PSMasked128, + ssa.OpAMD64VPERMI2PSMasked128load, + ssa.OpAMD64VPERMI2DMasked128, + ssa.OpAMD64VPERMI2DMasked128load, + ssa.OpAMD64VPERMI2PSMasked256, + ssa.OpAMD64VPERMI2PSMasked256load, + ssa.OpAMD64VPERMI2DMasked256, + 
ssa.OpAMD64VPERMI2DMasked256load, + ssa.OpAMD64VPERMI2PSMasked512, + ssa.OpAMD64VPERMI2PSMasked512load, + ssa.OpAMD64VPERMI2DMasked512, + ssa.OpAMD64VPERMI2DMasked512load, + ssa.OpAMD64VPERMI2PDMasked128, + ssa.OpAMD64VPERMI2PDMasked128load, + ssa.OpAMD64VPERMI2QMasked128, + ssa.OpAMD64VPERMI2QMasked128load, + ssa.OpAMD64VPERMI2PDMasked256, + ssa.OpAMD64VPERMI2PDMasked256load, + ssa.OpAMD64VPERMI2QMasked256, + ssa.OpAMD64VPERMI2QMasked256load, + ssa.OpAMD64VPERMI2PDMasked512, + ssa.OpAMD64VPERMI2PDMasked512load, + ssa.OpAMD64VPERMI2QMasked512, + ssa.OpAMD64VPERMI2QMasked512load, ssa.OpAMD64VPALIGNRMasked256, ssa.OpAMD64VPALIGNRMasked512, ssa.OpAMD64VPALIGNRMasked128, @@ -3061,48 +3102,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPORQMasked256load, ssa.OpAMD64VPORQMasked512, ssa.OpAMD64VPORQMasked512load, - ssa.OpAMD64VPERMI2BMasked128, - ssa.OpAMD64VPERMI2BMasked256, - ssa.OpAMD64VPERMI2BMasked512, - ssa.OpAMD64VPERMI2WMasked128, - ssa.OpAMD64VPERMI2WMasked256, - ssa.OpAMD64VPERMI2WMasked512, - ssa.OpAMD64VPERMI2PSMasked128, - ssa.OpAMD64VPERMI2PSMasked128load, - ssa.OpAMD64VPERMI2DMasked128, - ssa.OpAMD64VPERMI2DMasked128load, - ssa.OpAMD64VPERMI2PSMasked256, - ssa.OpAMD64VPERMI2PSMasked256load, - ssa.OpAMD64VPERMI2DMasked256, - ssa.OpAMD64VPERMI2DMasked256load, - ssa.OpAMD64VPERMI2PSMasked512, - ssa.OpAMD64VPERMI2PSMasked512load, - ssa.OpAMD64VPERMI2DMasked512, - ssa.OpAMD64VPERMI2DMasked512load, - ssa.OpAMD64VPERMI2PDMasked128, - ssa.OpAMD64VPERMI2PDMasked128load, - ssa.OpAMD64VPERMI2QMasked128, - ssa.OpAMD64VPERMI2QMasked128load, - ssa.OpAMD64VPERMI2PDMasked256, - ssa.OpAMD64VPERMI2PDMasked256load, - ssa.OpAMD64VPERMI2QMasked256, - ssa.OpAMD64VPERMI2QMasked256load, - ssa.OpAMD64VPERMI2PDMasked512, - ssa.OpAMD64VPERMI2PDMasked512load, - ssa.OpAMD64VPERMI2QMasked512, - ssa.OpAMD64VPERMI2QMasked512load, - ssa.OpAMD64VPSHUFDMasked256, - ssa.OpAMD64VPSHUFDMasked256load, - ssa.OpAMD64VPSHUFDMasked512, - ssa.OpAMD64VPSHUFDMasked512load, - ssa.OpAMD64VPSHUFHWMasked256, - ssa.OpAMD64VPSHUFHWMasked512, - ssa.OpAMD64VPSHUFHWMasked128, - ssa.OpAMD64VPSHUFDMasked128, - ssa.OpAMD64VPSHUFDMasked128load, - ssa.OpAMD64VPSHUFBMasked256, - ssa.OpAMD64VPSHUFBMasked512, - ssa.OpAMD64VPSHUFBMasked128, + ssa.OpAMD64VPERMBMasked128, ssa.OpAMD64VPERMBMasked256, ssa.OpAMD64VPERMBMasked512, ssa.OpAMD64VPERMWMasked128, @@ -3124,6 +3124,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPERMPDMasked512load, ssa.OpAMD64VPERMQMasked512, ssa.OpAMD64VPERMQMasked512load, + ssa.OpAMD64VPSHUFBMasked256, + ssa.OpAMD64VPSHUFBMasked512, + ssa.OpAMD64VPSHUFBMasked128, ssa.OpAMD64VRCP14PSMasked128, ssa.OpAMD64VRCP14PSMasked128load, ssa.OpAMD64VRCP14PSMasked256, @@ -3418,6 +3421,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VMOVDQU64Masked128, ssa.OpAMD64VMOVDQU64Masked256, ssa.OpAMD64VMOVDQU64Masked512, + ssa.OpAMD64VPSHUFDMasked256, + ssa.OpAMD64VPSHUFDMasked256load, + ssa.OpAMD64VPSHUFDMasked512, + ssa.OpAMD64VPSHUFDMasked512load, + ssa.OpAMD64VPSHUFHWMasked256, + ssa.OpAMD64VPSHUFHWMasked512, + ssa.OpAMD64VPSHUFHWMasked128, + ssa.OpAMD64VPSHUFLWMasked256, + ssa.OpAMD64VPSHUFLWMasked512, + ssa.OpAMD64VPSHUFLWMasked128, + ssa.OpAMD64VPSHUFDMasked128, + ssa.OpAMD64VPSHUFDMasked128load, ssa.OpAMD64VPSLLWMasked128const, ssa.OpAMD64VPSLLWMasked256const, ssa.OpAMD64VPSLLWMasked512const, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 5a9a1c0bc7..283a2e53cd 100644 --- 
a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -216,6 +216,36 @@ (CompressUint64x2 x mask) => (VPCOMPRESSQMasked128 x (VPMOVVec64x2ToM mask)) (CompressUint64x4 x mask) => (VPCOMPRESSQMasked256 x (VPMOVVec64x4ToM mask)) (CompressUint64x8 x mask) => (VPCOMPRESSQMasked512 x (VPMOVVec64x8ToM mask)) +(ConcatPermuteFloat32x4 ...) => (VPERMI2PS128 ...) +(ConcatPermuteFloat32x8 ...) => (VPERMI2PS256 ...) +(ConcatPermuteFloat32x16 ...) => (VPERMI2PS512 ...) +(ConcatPermuteFloat64x2 ...) => (VPERMI2PD128 ...) +(ConcatPermuteFloat64x4 ...) => (VPERMI2PD256 ...) +(ConcatPermuteFloat64x8 ...) => (VPERMI2PD512 ...) +(ConcatPermuteInt8x16 ...) => (VPERMI2B128 ...) +(ConcatPermuteInt8x32 ...) => (VPERMI2B256 ...) +(ConcatPermuteInt8x64 ...) => (VPERMI2B512 ...) +(ConcatPermuteInt16x8 ...) => (VPERMI2W128 ...) +(ConcatPermuteInt16x16 ...) => (VPERMI2W256 ...) +(ConcatPermuteInt16x32 ...) => (VPERMI2W512 ...) +(ConcatPermuteInt32x4 ...) => (VPERMI2D128 ...) +(ConcatPermuteInt32x8 ...) => (VPERMI2D256 ...) +(ConcatPermuteInt32x16 ...) => (VPERMI2D512 ...) +(ConcatPermuteInt64x2 ...) => (VPERMI2Q128 ...) +(ConcatPermuteInt64x4 ...) => (VPERMI2Q256 ...) +(ConcatPermuteInt64x8 ...) => (VPERMI2Q512 ...) +(ConcatPermuteUint8x16 ...) => (VPERMI2B128 ...) +(ConcatPermuteUint8x32 ...) => (VPERMI2B256 ...) +(ConcatPermuteUint8x64 ...) => (VPERMI2B512 ...) +(ConcatPermuteUint16x8 ...) => (VPERMI2W128 ...) +(ConcatPermuteUint16x16 ...) => (VPERMI2W256 ...) +(ConcatPermuteUint16x32 ...) => (VPERMI2W512 ...) +(ConcatPermuteUint32x4 ...) => (VPERMI2D128 ...) +(ConcatPermuteUint32x8 ...) => (VPERMI2D256 ...) +(ConcatPermuteUint32x16 ...) => (VPERMI2D512 ...) +(ConcatPermuteUint64x2 ...) => (VPERMI2Q128 ...) +(ConcatPermuteUint64x4 ...) => (VPERMI2Q256 ...) +(ConcatPermuteUint64x8 ...) => (VPERMI2Q512 ...) (ConcatShiftBytesRightUint8x16 ...) => (VPALIGNR128 ...) (ConcatShiftBytesRightGroupedUint8x32 ...) => (VPALIGNR256 ...) (ConcatShiftBytesRightGroupedUint8x64 ...) => (VPALIGNR512 ...) @@ -794,7 +824,7 @@ (PermuteFloat32x16 ...) => (VPERMPS512 ...) (PermuteFloat64x4 ...) => (VPERMPD256 ...) (PermuteFloat64x8 ...) => (VPERMPD512 ...) -(PermuteInt8x16 ...) => (VPSHUFB128 ...) +(PermuteInt8x16 ...) => (VPERMB128 ...) (PermuteInt8x32 ...) => (VPERMB256 ...) (PermuteInt8x64 ...) => (VPERMB512 ...) (PermuteInt16x8 ...) => (VPERMW128 ...) @@ -804,7 +834,7 @@ (PermuteInt32x16 ...) => (VPERMD512 ...) (PermuteInt64x4 ...) => (VPERMQ256 ...) (PermuteInt64x8 ...) => (VPERMQ512 ...) -(PermuteUint8x16 ...) => (VPSHUFB128 ...) +(PermuteUint8x16 ...) => (VPERMB128 ...) (PermuteUint8x32 ...) => (VPERMB256 ...) (PermuteUint8x64 ...) => (VPERMB512 ...) (PermuteUint16x8 ...) => (VPERMW128 ...) @@ -814,62 +844,12 @@ (PermuteUint32x16 ...) => (VPERMD512 ...) (PermuteUint64x4 ...) => (VPERMQ256 ...) (PermuteUint64x8 ...) => (VPERMQ512 ...) -(Permute2Float32x4 ...) => (VPERMI2PS128 ...) -(Permute2Float32x8 ...) => (VPERMI2PS256 ...) -(Permute2Float32x16 ...) => (VPERMI2PS512 ...) -(Permute2Float64x2 ...) => (VPERMI2PD128 ...) -(Permute2Float64x4 ...) => (VPERMI2PD256 ...) -(Permute2Float64x8 ...) => (VPERMI2PD512 ...) -(Permute2Int8x16 ...) => (VPERMI2B128 ...) -(Permute2Int8x32 ...) => (VPERMI2B256 ...) -(Permute2Int8x64 ...) => (VPERMI2B512 ...) -(Permute2Int16x8 ...) => (VPERMI2W128 ...) -(Permute2Int16x16 ...) => (VPERMI2W256 ...) -(Permute2Int16x32 ...) => (VPERMI2W512 ...) -(Permute2Int32x4 ...) => (VPERMI2D128 ...) -(Permute2Int32x8 ...) => (VPERMI2D256 ...) 
-(Permute2Int32x16 ...) => (VPERMI2D512 ...) -(Permute2Int64x2 ...) => (VPERMI2Q128 ...) -(Permute2Int64x4 ...) => (VPERMI2Q256 ...) -(Permute2Int64x8 ...) => (VPERMI2Q512 ...) -(Permute2Uint8x16 ...) => (VPERMI2B128 ...) -(Permute2Uint8x32 ...) => (VPERMI2B256 ...) -(Permute2Uint8x64 ...) => (VPERMI2B512 ...) -(Permute2Uint16x8 ...) => (VPERMI2W128 ...) -(Permute2Uint16x16 ...) => (VPERMI2W256 ...) -(Permute2Uint16x32 ...) => (VPERMI2W512 ...) -(Permute2Uint32x4 ...) => (VPERMI2D128 ...) -(Permute2Uint32x8 ...) => (VPERMI2D256 ...) -(Permute2Uint32x16 ...) => (VPERMI2D512 ...) -(Permute2Uint64x2 ...) => (VPERMI2Q128 ...) -(Permute2Uint64x4 ...) => (VPERMI2Q256 ...) -(Permute2Uint64x8 ...) => (VPERMI2Q512 ...) -(PermuteConstantInt32x4 ...) => (VPSHUFD128 ...) -(PermuteConstantUint32x4 ...) => (VPSHUFD128 ...) -(PermuteConstantGroupedInt32x8 ...) => (VPSHUFD256 ...) -(PermuteConstantGroupedInt32x16 ...) => (VPSHUFD512 ...) -(PermuteConstantGroupedUint32x8 ...) => (VPSHUFD256 ...) -(PermuteConstantGroupedUint32x16 ...) => (VPSHUFD512 ...) -(PermuteConstantHiInt16x8 ...) => (VPSHUFHW128 ...) -(PermuteConstantHiInt32x4 ...) => (VPSHUFHW128 ...) -(PermuteConstantHiUint16x8 ...) => (VPSHUFHW128 ...) -(PermuteConstantHiUint32x4 ...) => (VPSHUFHW128 ...) -(PermuteConstantHiGroupedInt16x16 ...) => (VPSHUFHW256 ...) -(PermuteConstantHiGroupedInt16x32 ...) => (VPSHUFHW512 ...) -(PermuteConstantHiGroupedUint16x16 ...) => (VPSHUFHW256 ...) -(PermuteConstantHiGroupedUint16x32 ...) => (VPSHUFHW512 ...) -(PermuteConstantLoInt16x8 ...) => (VPSHUFHW128 ...) -(PermuteConstantLoInt32x4 ...) => (VPSHUFHW128 ...) -(PermuteConstantLoUint16x8 ...) => (VPSHUFHW128 ...) -(PermuteConstantLoUint32x4 ...) => (VPSHUFHW128 ...) -(PermuteConstantLoGroupedInt16x16 ...) => (VPSHUFHW256 ...) -(PermuteConstantLoGroupedInt16x32 ...) => (VPSHUFHW512 ...) -(PermuteConstantLoGroupedUint16x16 ...) => (VPSHUFHW256 ...) -(PermuteConstantLoGroupedUint16x32 ...) => (VPSHUFHW512 ...) -(PermuteGroupedInt8x32 ...) => (VPSHUFB256 ...) -(PermuteGroupedInt8x64 ...) => (VPSHUFB512 ...) -(PermuteGroupedUint8x32 ...) => (VPSHUFB256 ...) -(PermuteGroupedUint8x64 ...) => (VPSHUFB512 ...) +(PermuteOrZeroInt8x16 ...) => (VPSHUFB128 ...) +(PermuteOrZeroUint8x16 ...) => (VPSHUFB128 ...) +(PermuteOrZeroGroupedInt8x32 ...) => (VPSHUFB256 ...) +(PermuteOrZeroGroupedInt8x64 ...) => (VPSHUFB512 ...) +(PermuteOrZeroGroupedUint8x32 ...) => (VPSHUFB256 ...) +(PermuteOrZeroGroupedUint8x64 ...) => (VPSHUFB512 ...) (ReciprocalFloat32x4 ...) => (VRCPPS128 ...) (ReciprocalFloat32x8 ...) => (VRCPPS256 ...) (ReciprocalFloat32x16 ...) => (VRCP14PS512 ...) @@ -1324,6 +1304,24 @@ (concatSelectedConstantGroupedUint32x16 ...) => (VSHUFPS512 ...) (concatSelectedConstantGroupedUint64x4 ...) => (VSHUFPD256 ...) (concatSelectedConstantGroupedUint64x8 ...) => (VSHUFPD512 ...) +(permuteScalarsInt32x4 ...) => (VPSHUFD128 ...) +(permuteScalarsUint32x4 ...) => (VPSHUFD128 ...) +(permuteScalarsGroupedInt32x8 ...) => (VPSHUFD256 ...) +(permuteScalarsGroupedInt32x16 ...) => (VPSHUFD512 ...) +(permuteScalarsGroupedUint32x8 ...) => (VPSHUFD256 ...) +(permuteScalarsGroupedUint32x16 ...) => (VPSHUFD512 ...) +(permuteScalarsHiInt16x8 ...) => (VPSHUFHW128 ...) +(permuteScalarsHiUint16x8 ...) => (VPSHUFHW128 ...) +(permuteScalarsHiGroupedInt16x16 ...) => (VPSHUFHW256 ...) +(permuteScalarsHiGroupedInt16x32 ...) => (VPSHUFHW512 ...) +(permuteScalarsHiGroupedUint16x16 ...) => (VPSHUFHW256 ...) +(permuteScalarsHiGroupedUint16x32 ...) => (VPSHUFHW512 ...) +(permuteScalarsLoInt16x8 ...) 
=> (VPSHUFLW128 ...) +(permuteScalarsLoUint16x8 ...) => (VPSHUFLW128 ...) +(permuteScalarsLoGroupedInt16x16 ...) => (VPSHUFLW256 ...) +(permuteScalarsLoGroupedInt16x32 ...) => (VPSHUFLW512 ...) +(permuteScalarsLoGroupedUint16x16 ...) => (VPSHUFLW256 ...) +(permuteScalarsLoGroupedUint16x32 ...) => (VPSHUFLW512 ...) (ternInt32x4 ...) => (VPTERNLOGD128 ...) (ternInt32x8 ...) => (VPTERNLOGD256 ...) (ternInt32x16 ...) => (VPTERNLOGD512 ...) @@ -1417,6 +1415,24 @@ (VMOVDQU64Masked128 (VREDUCEPD128 [a] x) mask) => (VREDUCEPDMasked128 [a] x mask) (VMOVDQU64Masked256 (VREDUCEPD256 [a] x) mask) => (VREDUCEPDMasked256 [a] x mask) (VMOVDQU64Masked512 (VREDUCEPD512 [a] x) mask) => (VREDUCEPDMasked512 [a] x mask) +(VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) => (VPERMI2BMasked128 x y z mask) +(VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) => (VPERMI2BMasked256 x y z mask) +(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask) +(VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) => (VPERMI2WMasked128 x y z mask) +(VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) => (VPERMI2WMasked256 x y z mask) +(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask) +(VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) => (VPERMI2PSMasked128 x y z mask) +(VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) => (VPERMI2DMasked128 x y z mask) +(VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) => (VPERMI2PSMasked256 x y z mask) +(VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) => (VPERMI2DMasked256 x y z mask) +(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask) +(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask) +(VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) => (VPERMI2PDMasked128 x y z mask) +(VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) => (VPERMI2QMasked128 x y z mask) +(VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) => (VPERMI2PDMasked256 x y z mask) +(VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) => (VPERMI2QMasked256 x y z mask) +(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask) +(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask) (VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask) => (VPALIGNRMasked256 [a] x y mask) (VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask) => (VPALIGNRMasked512 [a] x y mask) (VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask) => (VPALIGNRMasked128 [a] x y mask) @@ -1668,33 +1684,7 @@ (VMOVDQU64Masked512 (VPOPCNTQ512 x) mask) => (VPOPCNTQMasked512 x mask) (VMOVDQU32Masked512 (VPORD512 x y) mask) => (VPORDMasked512 x y mask) (VMOVDQU64Masked512 (VPORQ512 x y) mask) => (VPORQMasked512 x y mask) -(VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) => (VPERMI2BMasked128 x y z mask) -(VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) => (VPERMI2BMasked256 x y z mask) -(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask) -(VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) => (VPERMI2WMasked128 x y z mask) -(VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) => (VPERMI2WMasked256 x y z mask) -(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask) -(VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) => (VPERMI2PSMasked128 x y z mask) -(VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) => (VPERMI2DMasked128 x y z mask) -(VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) => (VPERMI2PSMasked256 x y z mask) -(VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) => (VPERMI2DMasked256 x y z mask) -(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask) 
-(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask) -(VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) => (VPERMI2PDMasked128 x y z mask) -(VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) => (VPERMI2QMasked128 x y z mask) -(VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) => (VPERMI2PDMasked256 x y z mask) -(VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) => (VPERMI2QMasked256 x y z mask) -(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask) -(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask) -(VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) => (VPSHUFDMasked256 [a] x mask) -(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask) -(VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) => (VPSHUFHWMasked256 [a] x mask) -(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask) -(VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) => (VPSHUFHWMasked128 [a] x mask) -(VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) => (VPSHUFDMasked128 [a] x mask) -(VMOVDQU8Masked256 (VPSHUFB256 x y) mask) => (VPSHUFBMasked256 x y mask) -(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask) -(VMOVDQU8Masked128 (VPSHUFB128 x y) mask) => (VPSHUFBMasked128 x y mask) +(VMOVDQU8Masked128 (VPERMB128 x y) mask) => (VPERMBMasked128 x y mask) (VMOVDQU8Masked256 (VPERMB256 x y) mask) => (VPERMBMasked256 x y mask) (VMOVDQU8Masked512 (VPERMB512 x y) mask) => (VPERMBMasked512 x y mask) (VMOVDQU16Masked128 (VPERMW128 x y) mask) => (VPERMWMasked128 x y mask) @@ -1708,6 +1698,9 @@ (VMOVDQU64Masked256 (VPERMQ256 x y) mask) => (VPERMQMasked256 x y mask) (VMOVDQU64Masked512 (VPERMPD512 x y) mask) => (VPERMPDMasked512 x y mask) (VMOVDQU64Masked512 (VPERMQ512 x y) mask) => (VPERMQMasked512 x y mask) +(VMOVDQU8Masked256 (VPSHUFB256 x y) mask) => (VPSHUFBMasked256 x y mask) +(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask) +(VMOVDQU8Masked128 (VPSHUFB128 x y) mask) => (VPSHUFBMasked128 x y mask) (VMOVDQU32Masked512 (VRCP14PS512 x) mask) => (VRCP14PSMasked512 x mask) (VMOVDQU64Masked128 (VRCP14PD128 x) mask) => (VRCP14PDMasked128 x mask) (VMOVDQU64Masked256 (VRCP14PD256 x) mask) => (VRCP14PDMasked256 x mask) @@ -1874,6 +1867,15 @@ (VMOVDQU16Masked512 (VPSUBUSW512 x y) mask) => (VPSUBUSWMasked512 x y mask) (VMOVDQU32Masked512 (VPXORD512 x y) mask) => (VPXORDMasked512 x y mask) (VMOVDQU64Masked512 (VPXORQ512 x y) mask) => (VPXORQMasked512 x y mask) +(VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) => (VPSHUFDMasked256 [a] x mask) +(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask) +(VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) => (VPSHUFHWMasked256 [a] x mask) +(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask) +(VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) => (VPSHUFHWMasked128 [a] x mask) +(VMOVDQU16Masked256 (VPSHUFLW256 [a] x) mask) => (VPSHUFLWMasked256 [a] x mask) +(VMOVDQU16Masked512 (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512 [a] x mask) +(VMOVDQU16Masked128 (VPSHUFLW128 [a] x) mask) => (VPSHUFLWMasked128 [a] x mask) +(VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) => (VPSHUFDMasked128 [a] x mask) (VMOVDQU16Masked128 (VPSLLW128const [a] x) mask) => (VPSLLWMasked128const [a] x mask) (VMOVDQU16Masked256 (VPSLLW256const [a] x) mask) => (VPSLLWMasked256const [a] x mask) (VMOVDQU16Masked512 (VPSLLW512const [a] x) mask) => (VPSLLWMasked512const [a] x mask) @@ -2021,6 +2023,7 @@ (VPBLENDMWMasked512 dst (VPSHLDW512 [a] x y) mask) => 
(VPSHLDWMasked512Merging dst [a] x y mask) (VPBLENDMWMasked512 dst (VPSHRDW512 [a] x y) mask) => (VPSHRDWMasked512Merging dst [a] x y mask) (VPBLENDMWMasked512 dst (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512Merging dst [a] x mask) +(VPBLENDMWMasked512 dst (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512Merging dst [a] x mask) (VPBLENDMWMasked512 dst (VPSLLVW512 x y) mask) => (VPSLLVWMasked512Merging dst x y mask) (VPBLENDMWMasked512 dst (VPSLLW512const [a] x) mask) => (VPSLLWMasked512constMerging dst [a] x mask) (VPBLENDMWMasked512 dst (VPSRAVW512 x y) mask) => (VPSRAVWMasked512Merging dst x y mask) @@ -2170,6 +2173,7 @@ (VPBLENDVB128 dst (VPSHUFB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked128Merging dst x y (VPMOVVec8x16ToM mask)) (VPBLENDVB128 dst (VPSHUFD128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked128Merging dst [a] x (VPMOVVec32x4ToM mask)) (VPBLENDVB128 dst (VPSHUFHW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked128Merging dst [a] x (VPMOVVec16x8ToM mask)) +(VPBLENDVB128 dst (VPSHUFLW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFLWMasked128Merging dst [a] x (VPMOVVec16x8ToM mask)) (VPBLENDVB128 dst (VPSLLD128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked128constMerging dst [a] x (VPMOVVec32x4ToM mask)) (VPBLENDVB128 dst (VPSLLQ128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked128constMerging dst [a] x (VPMOVVec64x2ToM mask)) (VPBLENDVB128 dst (VPSLLVD128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked128Merging dst x y (VPMOVVec32x4ToM mask)) @@ -2305,6 +2309,7 @@ (VPBLENDVB256 dst (VPSHUFB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked256Merging dst x y (VPMOVVec8x32ToM mask)) (VPBLENDVB256 dst (VPSHUFD256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked256Merging dst [a] x (VPMOVVec32x8ToM mask)) (VPBLENDVB256 dst (VPSHUFHW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked256Merging dst [a] x (VPMOVVec16x16ToM mask)) +(VPBLENDVB256 dst (VPSHUFLW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFLWMasked256Merging dst [a] x (VPMOVVec16x16ToM mask)) (VPBLENDVB256 dst (VPSLLD256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked256constMerging dst [a] x (VPMOVVec32x8ToM mask)) (VPBLENDVB256 dst (VPSLLQ256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked256constMerging dst [a] x (VPMOVVec64x4ToM mask)) (VPBLENDVB256 dst (VPSLLVD256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked256Merging dst x y (VPMOVVec32x8ToM mask)) @@ -2410,6 +2415,30 @@ (VREDUCEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) (VREDUCEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) (VREDUCEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) +(VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem) +(VPERMI2D128 x y l:(VMOVDQUload128 {sym} 
[off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem) +(VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem) +(VPERMI2D256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D256load {sym} [off] x y ptr mem) +(VPERMI2PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS512load {sym} [off] x y ptr mem) +(VPERMI2D512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D512load {sym} [off] x y ptr mem) +(VPERMI2PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD128load {sym} [off] x y ptr mem) +(VPERMI2Q128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q128load {sym} [off] x y ptr mem) +(VPERMI2PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD256load {sym} [off] x y ptr mem) +(VPERMI2Q256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q256load {sym} [off] x y ptr mem) +(VPERMI2PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD512load {sym} [off] x y ptr mem) +(VPERMI2Q512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q512load {sym} [off] x y ptr mem) +(VPERMI2PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked128load {sym} [off] x y ptr mask mem) +(VPERMI2DMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked128load {sym} [off] x y ptr mask mem) +(VPERMI2PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked256load {sym} [off] x y ptr mask mem) +(VPERMI2DMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked256load {sym} [off] x y ptr mask mem) +(VPERMI2PSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked512load {sym} [off] x y ptr mask mem) +(VPERMI2DMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked512load {sym} [off] x y ptr mask mem) +(VPERMI2PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked128load {sym} [off] x y ptr mask mem) +(VPERMI2QMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked128load {sym} [off] x y ptr mask mem) +(VPERMI2PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked256load {sym} [off] x y ptr mask mem) +(VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem) +(VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem) +(VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem) (VPACKSSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDW512load 
{sym} [off] x ptr mem) (VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem) (VPACKSSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked256load {sym} [off] x ptr mask mem) @@ -2636,34 +2665,6 @@ (VPERMQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ256load {sym} [off] x ptr mem) (VPERMPD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMPD512load {sym} [off] x ptr mem) (VPERMQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ512load {sym} [off] x ptr mem) -(VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem) -(VPERMI2D128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem) -(VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem) -(VPERMI2D256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D256load {sym} [off] x y ptr mem) -(VPERMI2PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS512load {sym} [off] x y ptr mem) -(VPERMI2D512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D512load {sym} [off] x y ptr mem) -(VPERMI2PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD128load {sym} [off] x y ptr mem) -(VPERMI2Q128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q128load {sym} [off] x y ptr mem) -(VPERMI2PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD256load {sym} [off] x y ptr mem) -(VPERMI2Q256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q256load {sym} [off] x y ptr mem) -(VPERMI2PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD512load {sym} [off] x y ptr mem) -(VPERMI2Q512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q512load {sym} [off] x y ptr mem) -(VPERMI2PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked128load {sym} [off] x y ptr mask mem) -(VPERMI2DMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked128load {sym} [off] x y ptr mask mem) -(VPERMI2PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked256load {sym} [off] x y ptr mask mem) -(VPERMI2DMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked256load {sym} [off] x y ptr mask mem) -(VPERMI2PSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked512load {sym} [off] x y ptr mask mem) -(VPERMI2DMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked512load {sym} [off] x y ptr mask mem) -(VPERMI2PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked128load {sym} [off] x y 
ptr mask mem) -(VPERMI2QMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked128load {sym} [off] x y ptr mask mem) -(VPERMI2PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked256load {sym} [off] x y ptr mask mem) -(VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem) -(VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem) -(VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem) -(VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) (VPERMPSMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked256load {sym} [off] x ptr mask mem) (VPERMDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMDMasked256load {sym} [off] x ptr mask mem) (VPERMPSMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked512load {sym} [off] x ptr mask mem) @@ -2862,6 +2863,10 @@ (VPBLENDMQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMQMasked512load {sym} [off] x ptr mask mem) (VSHUFPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) (VSHUFPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) +(VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) +(VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) +(VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) +(VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) (VPSLLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) (VPSLLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} 
[makeValAndOff(int32(int8(c)),off)] ptr mem) (VPSLLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index 674cfb19d6..404354d387 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -383,8 +383,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPDPWSSDMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPDPWSSDMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPDPWSSDMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true}, + {name: "VPERMB128", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPERMBMasked128", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMBMasked512", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false}, @@ -1310,6 +1312,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPSHUFHWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSHUFHWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHUFHWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPSHUFLW128", argLength: 1, reg: w11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPSHUFLW256", argLength: 1, reg: v11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPSHUFLW512", argLength: 1, reg: w11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPSHUFLWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPSHUFLWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPSHUFLWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSLLD128const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSLLD256const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSLLD512const", argLength: 1, reg: w11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: 
"Vec512", resultInArg0: false}, @@ -2392,6 +2400,9 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPSHUFHWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPSHUFHWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPSHUFHWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true}, + {name: "VPSHUFLWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPSHUFLWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true}, + {name: "VPSHUFLWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPSLLDMasked128constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPSLLDMasked256constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPSLLDMasked512constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 6a79fa3856..3fae158c0a 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -207,6 +207,36 @@ func simdGenericOps() []opData { {name: "CompressUint64x2", argLength: 2, commutative: false}, {name: "CompressUint64x4", argLength: 2, commutative: false}, {name: "CompressUint64x8", argLength: 2, commutative: false}, + {name: "ConcatPermuteFloat32x4", argLength: 3, commutative: false}, + {name: "ConcatPermuteFloat32x8", argLength: 3, commutative: false}, + {name: "ConcatPermuteFloat32x16", argLength: 3, commutative: false}, + {name: "ConcatPermuteFloat64x2", argLength: 3, commutative: false}, + {name: "ConcatPermuteFloat64x4", argLength: 3, commutative: false}, + {name: "ConcatPermuteFloat64x8", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt8x16", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt8x32", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt8x64", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt16x8", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt16x16", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt16x32", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt32x4", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt32x8", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt32x16", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt64x2", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt64x4", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt64x8", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint8x16", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint8x32", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint8x64", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint16x8", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint16x16", argLength: 3, 
commutative: false}, + {name: "ConcatPermuteUint16x32", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint32x4", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint32x8", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint32x16", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint64x2", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint64x4", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint64x8", argLength: 3, commutative: false}, {name: "ConvertToInt8Int16x8", argLength: 1, commutative: false}, {name: "ConvertToInt8Int16x16", argLength: 1, commutative: false}, {name: "ConvertToInt8Int16x32", argLength: 1, commutative: false}, @@ -750,44 +780,10 @@ func simdGenericOps() []opData { {name: "OrUint64x2", argLength: 2, commutative: true}, {name: "OrUint64x4", argLength: 2, commutative: true}, {name: "OrUint64x8", argLength: 2, commutative: true}, - {name: "Permute2Float32x4", argLength: 3, commutative: false}, - {name: "Permute2Float32x8", argLength: 3, commutative: false}, - {name: "Permute2Float32x16", argLength: 3, commutative: false}, - {name: "Permute2Float64x2", argLength: 3, commutative: false}, - {name: "Permute2Float64x4", argLength: 3, commutative: false}, - {name: "Permute2Float64x8", argLength: 3, commutative: false}, - {name: "Permute2Int8x16", argLength: 3, commutative: false}, - {name: "Permute2Int8x32", argLength: 3, commutative: false}, - {name: "Permute2Int8x64", argLength: 3, commutative: false}, - {name: "Permute2Int16x8", argLength: 3, commutative: false}, - {name: "Permute2Int16x16", argLength: 3, commutative: false}, - {name: "Permute2Int16x32", argLength: 3, commutative: false}, - {name: "Permute2Int32x4", argLength: 3, commutative: false}, - {name: "Permute2Int32x8", argLength: 3, commutative: false}, - {name: "Permute2Int32x16", argLength: 3, commutative: false}, - {name: "Permute2Int64x2", argLength: 3, commutative: false}, - {name: "Permute2Int64x4", argLength: 3, commutative: false}, - {name: "Permute2Int64x8", argLength: 3, commutative: false}, - {name: "Permute2Uint8x16", argLength: 3, commutative: false}, - {name: "Permute2Uint8x32", argLength: 3, commutative: false}, - {name: "Permute2Uint8x64", argLength: 3, commutative: false}, - {name: "Permute2Uint16x8", argLength: 3, commutative: false}, - {name: "Permute2Uint16x16", argLength: 3, commutative: false}, - {name: "Permute2Uint16x32", argLength: 3, commutative: false}, - {name: "Permute2Uint32x4", argLength: 3, commutative: false}, - {name: "Permute2Uint32x8", argLength: 3, commutative: false}, - {name: "Permute2Uint32x16", argLength: 3, commutative: false}, - {name: "Permute2Uint64x2", argLength: 3, commutative: false}, - {name: "Permute2Uint64x4", argLength: 3, commutative: false}, - {name: "Permute2Uint64x8", argLength: 3, commutative: false}, {name: "PermuteFloat32x8", argLength: 2, commutative: false}, {name: "PermuteFloat32x16", argLength: 2, commutative: false}, {name: "PermuteFloat64x4", argLength: 2, commutative: false}, {name: "PermuteFloat64x8", argLength: 2, commutative: false}, - {name: "PermuteGroupedInt8x32", argLength: 2, commutative: false}, - {name: "PermuteGroupedInt8x64", argLength: 2, commutative: false}, - {name: "PermuteGroupedUint8x32", argLength: 2, commutative: false}, - {name: "PermuteGroupedUint8x64", argLength: 2, commutative: false}, {name: "PermuteInt8x16", argLength: 2, commutative: false}, {name: "PermuteInt8x32", argLength: 2, commutative: false}, {name: "PermuteInt8x64", argLength: 2, commutative: false}, 
@@ -798,6 +794,12 @@ func simdGenericOps() []opData { {name: "PermuteInt32x16", argLength: 2, commutative: false}, {name: "PermuteInt64x4", argLength: 2, commutative: false}, {name: "PermuteInt64x8", argLength: 2, commutative: false}, + {name: "PermuteOrZeroGroupedInt8x32", argLength: 2, commutative: false}, + {name: "PermuteOrZeroGroupedInt8x64", argLength: 2, commutative: false}, + {name: "PermuteOrZeroGroupedUint8x32", argLength: 2, commutative: false}, + {name: "PermuteOrZeroGroupedUint8x64", argLength: 2, commutative: false}, + {name: "PermuteOrZeroInt8x16", argLength: 2, commutative: false}, + {name: "PermuteOrZeroUint8x16", argLength: 2, commutative: false}, {name: "PermuteUint8x16", argLength: 2, commutative: false}, {name: "PermuteUint8x32", argLength: 2, commutative: false}, {name: "PermuteUint8x64", argLength: 2, commutative: false}, @@ -1151,28 +1153,6 @@ func simdGenericOps() []opData { {name: "GetElemUint16x8", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemUint32x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemUint64x2", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantHiInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantHiUint32x4", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantLoInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantLoUint32x4", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantUint32x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RotateAllLeftInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RotateAllLeftInt32x8", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RotateAllLeftInt32x16", argLength: 1, commutative: false, aux: "UInt8"}, @@ -1292,6 +1272,24 @@ func simdGenericOps() []opData { {name: "concatSelectedConstantInt64x2", argLength: 2, commutative: false, aux: "UInt8"}, {name: "concatSelectedConstantUint32x4", 
argLength: 2, commutative: false, aux: "UInt8"}, {name: "concatSelectedConstantUint64x2", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsUint32x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "ternInt32x4", argLength: 3, commutative: false, aux: "UInt8"}, {name: "ternInt32x8", argLength: 3, commutative: false, aux: "UInt8"}, {name: "ternInt32x16", argLength: 3, commutative: false, aux: "UInt8"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index ea5491362f..fa94dfbbd5 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1624,8 +1624,10 @@ const ( OpAMD64VPDPWSSDMasked128 OpAMD64VPDPWSSDMasked256 OpAMD64VPDPWSSDMasked512 + OpAMD64VPERMB128 OpAMD64VPERMB256 OpAMD64VPERMB512 + OpAMD64VPERMBMasked128 OpAMD64VPERMBMasked256 OpAMD64VPERMBMasked512 OpAMD64VPERMD256 @@ -2551,6 +2553,12 @@ const ( OpAMD64VPSHUFHWMasked128 OpAMD64VPSHUFHWMasked256 OpAMD64VPSHUFHWMasked512 + OpAMD64VPSHUFLW128 + OpAMD64VPSHUFLW256 + OpAMD64VPSHUFLW512 + OpAMD64VPSHUFLWMasked128 + OpAMD64VPSHUFLWMasked256 + OpAMD64VPSHUFLWMasked512 OpAMD64VPSLLD128const OpAMD64VPSLLD256const OpAMD64VPSLLD512const @@ -3633,6 +3641,9 @@ const ( OpAMD64VPSHUFHWMasked128Merging OpAMD64VPSHUFHWMasked256Merging OpAMD64VPSHUFHWMasked512Merging + OpAMD64VPSHUFLWMasked128Merging + OpAMD64VPSHUFLWMasked256Merging + OpAMD64VPSHUFLWMasked512Merging OpAMD64VPSLLDMasked128constMerging OpAMD64VPSLLDMasked256constMerging OpAMD64VPSLLDMasked512constMerging @@ -6155,6 +6166,36 @@ const ( OpCompressUint64x2 OpCompressUint64x4 OpCompressUint64x8 + OpConcatPermuteFloat32x4 + OpConcatPermuteFloat32x8 + OpConcatPermuteFloat32x16 + OpConcatPermuteFloat64x2 + OpConcatPermuteFloat64x4 + OpConcatPermuteFloat64x8 + OpConcatPermuteInt8x16 + OpConcatPermuteInt8x32 + OpConcatPermuteInt8x64 + OpConcatPermuteInt16x8 + OpConcatPermuteInt16x16 + OpConcatPermuteInt16x32 + OpConcatPermuteInt32x4 + OpConcatPermuteInt32x8 + 
OpConcatPermuteInt32x16 + OpConcatPermuteInt64x2 + OpConcatPermuteInt64x4 + OpConcatPermuteInt64x8 + OpConcatPermuteUint8x16 + OpConcatPermuteUint8x32 + OpConcatPermuteUint8x64 + OpConcatPermuteUint16x8 + OpConcatPermuteUint16x16 + OpConcatPermuteUint16x32 + OpConcatPermuteUint32x4 + OpConcatPermuteUint32x8 + OpConcatPermuteUint32x16 + OpConcatPermuteUint64x2 + OpConcatPermuteUint64x4 + OpConcatPermuteUint64x8 OpConvertToInt8Int16x8 OpConvertToInt8Int16x16 OpConvertToInt8Int16x32 @@ -6698,44 +6739,10 @@ const ( OpOrUint64x2 OpOrUint64x4 OpOrUint64x8 - OpPermute2Float32x4 - OpPermute2Float32x8 - OpPermute2Float32x16 - OpPermute2Float64x2 - OpPermute2Float64x4 - OpPermute2Float64x8 - OpPermute2Int8x16 - OpPermute2Int8x32 - OpPermute2Int8x64 - OpPermute2Int16x8 - OpPermute2Int16x16 - OpPermute2Int16x32 - OpPermute2Int32x4 - OpPermute2Int32x8 - OpPermute2Int32x16 - OpPermute2Int64x2 - OpPermute2Int64x4 - OpPermute2Int64x8 - OpPermute2Uint8x16 - OpPermute2Uint8x32 - OpPermute2Uint8x64 - OpPermute2Uint16x8 - OpPermute2Uint16x16 - OpPermute2Uint16x32 - OpPermute2Uint32x4 - OpPermute2Uint32x8 - OpPermute2Uint32x16 - OpPermute2Uint64x2 - OpPermute2Uint64x4 - OpPermute2Uint64x8 OpPermuteFloat32x8 OpPermuteFloat32x16 OpPermuteFloat64x4 OpPermuteFloat64x8 - OpPermuteGroupedInt8x32 - OpPermuteGroupedInt8x64 - OpPermuteGroupedUint8x32 - OpPermuteGroupedUint8x64 OpPermuteInt8x16 OpPermuteInt8x32 OpPermuteInt8x64 @@ -6746,6 +6753,12 @@ const ( OpPermuteInt32x16 OpPermuteInt64x4 OpPermuteInt64x8 + OpPermuteOrZeroGroupedInt8x32 + OpPermuteOrZeroGroupedInt8x64 + OpPermuteOrZeroGroupedUint8x32 + OpPermuteOrZeroGroupedUint8x64 + OpPermuteOrZeroInt8x16 + OpPermuteOrZeroUint8x16 OpPermuteUint8x16 OpPermuteUint8x32 OpPermuteUint8x64 @@ -7099,28 +7112,6 @@ const ( OpGetElemUint16x8 OpGetElemUint32x4 OpGetElemUint64x2 - OpPermuteConstantGroupedInt32x8 - OpPermuteConstantGroupedInt32x16 - OpPermuteConstantGroupedUint32x8 - OpPermuteConstantGroupedUint32x16 - OpPermuteConstantHiGroupedInt16x16 - OpPermuteConstantHiGroupedInt16x32 - OpPermuteConstantHiGroupedUint16x16 - OpPermuteConstantHiGroupedUint16x32 - OpPermuteConstantHiInt16x8 - OpPermuteConstantHiInt32x4 - OpPermuteConstantHiUint16x8 - OpPermuteConstantHiUint32x4 - OpPermuteConstantInt32x4 - OpPermuteConstantLoGroupedInt16x16 - OpPermuteConstantLoGroupedInt16x32 - OpPermuteConstantLoGroupedUint16x16 - OpPermuteConstantLoGroupedUint16x32 - OpPermuteConstantLoInt16x8 - OpPermuteConstantLoInt32x4 - OpPermuteConstantLoUint16x8 - OpPermuteConstantLoUint32x4 - OpPermuteConstantUint32x4 OpRotateAllLeftInt32x4 OpRotateAllLeftInt32x8 OpRotateAllLeftInt32x16 @@ -7240,6 +7231,24 @@ const ( OpconcatSelectedConstantInt64x2 OpconcatSelectedConstantUint32x4 OpconcatSelectedConstantUint64x2 + OppermuteScalarsGroupedInt32x8 + OppermuteScalarsGroupedInt32x16 + OppermuteScalarsGroupedUint32x8 + OppermuteScalarsGroupedUint32x16 + OppermuteScalarsHiGroupedInt16x16 + OppermuteScalarsHiGroupedInt16x32 + OppermuteScalarsHiGroupedUint16x16 + OppermuteScalarsHiGroupedUint16x32 + OppermuteScalarsHiInt16x8 + OppermuteScalarsHiUint16x8 + OppermuteScalarsInt32x4 + OppermuteScalarsLoGroupedInt16x16 + OppermuteScalarsLoGroupedInt16x32 + OppermuteScalarsLoGroupedUint16x16 + OppermuteScalarsLoGroupedUint16x32 + OppermuteScalarsLoInt16x8 + OppermuteScalarsLoUint16x8 + OppermuteScalarsUint32x4 OpternInt32x4 OpternInt32x8 OpternInt32x16 @@ -26142,6 +26151,20 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPERMB128", + argLen: 2, + asm: x86.AVPERMB, + reg: regInfo{ + inputs: []inputInfo{ 
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPERMB256", argLen: 2, @@ -26170,6 +26193,21 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPERMBMasked128", + argLen: 3, + asm: x86.AVPERMB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPERMBMasked256", argLen: 3, @@ -39744,6 +39782,93 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPSHUFLW128", + auxType: auxUInt8, + argLen: 1, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPSHUFLW256", + auxType: auxUInt8, + argLen: 1, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSHUFLW512", + auxType: auxUInt8, + argLen: 1, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPSHUFLWMasked128", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPSHUFLWMasked256", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: 
"VPSHUFLWMasked512", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPSLLD128const", auxType: auxUInt8, @@ -57607,6 +57732,57 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPSHUFLWMasked128Merging", + auxType: auxUInt8, + argLen: 3, + resultInArg0: true, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPSHUFLWMasked256Merging", + auxType: auxUInt8, + argLen: 3, + resultInArg0: true, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPSHUFLWMasked512Merging", + auxType: auxUInt8, + argLen: 3, + resultInArg0: true, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPSLLDMasked128constMerging", auxType: auxUInt8, @@ -86874,6 +87050,156 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "ConcatPermuteFloat32x4", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteFloat32x8", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteFloat32x16", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteFloat64x2", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteFloat64x4", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteFloat64x8", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt8x16", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt8x32", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt8x64", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt16x8", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt16x16", + 
argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt16x32", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt32x4", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt32x8", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt32x16", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt64x2", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt64x4", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt64x8", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint8x16", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint8x32", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint8x64", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint16x8", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint16x16", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint16x32", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint32x4", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint32x8", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint32x16", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint64x2", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint64x4", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint64x8", + argLen: 3, + generic: true, + }, { name: "ConvertToInt8Int16x8", argLen: 1, @@ -89758,242 +90084,102 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2Float32x4", - argLen: 3, - generic: true, - }, - { - name: "Permute2Float32x8", - argLen: 3, - generic: true, - }, - { - name: "Permute2Float32x16", - argLen: 3, - generic: true, - }, - { - name: "Permute2Float64x2", - argLen: 3, - generic: true, - }, - { - name: "Permute2Float64x4", - argLen: 3, - generic: true, - }, - { - name: "Permute2Float64x8", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int8x16", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int8x32", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int8x64", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int16x8", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int16x16", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int16x32", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int32x4", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int32x8", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int32x16", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int64x2", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int64x4", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int64x8", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint8x16", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint8x32", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint8x64", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint16x8", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint16x16", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint16x32", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint32x4", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint32x8", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint32x16", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint64x2", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint64x4", - argLen: 3, + name: "PermuteFloat32x8", + argLen: 2, generic: true, }, { - name: "Permute2Uint64x8", - argLen: 3, + name: 
"PermuteFloat32x16", + argLen: 2, generic: true, }, { - name: "PermuteFloat32x8", + name: "PermuteFloat64x4", argLen: 2, generic: true, }, { - name: "PermuteFloat32x16", + name: "PermuteFloat64x8", argLen: 2, generic: true, }, { - name: "PermuteFloat64x4", + name: "PermuteInt8x16", argLen: 2, generic: true, }, { - name: "PermuteFloat64x8", + name: "PermuteInt8x32", argLen: 2, generic: true, }, { - name: "PermuteGroupedInt8x32", + name: "PermuteInt8x64", argLen: 2, generic: true, }, { - name: "PermuteGroupedInt8x64", + name: "PermuteInt16x8", argLen: 2, generic: true, }, { - name: "PermuteGroupedUint8x32", + name: "PermuteInt16x16", argLen: 2, generic: true, }, { - name: "PermuteGroupedUint8x64", + name: "PermuteInt16x32", argLen: 2, generic: true, }, { - name: "PermuteInt8x16", + name: "PermuteInt32x8", argLen: 2, generic: true, }, { - name: "PermuteInt8x32", + name: "PermuteInt32x16", argLen: 2, generic: true, }, { - name: "PermuteInt8x64", + name: "PermuteInt64x4", argLen: 2, generic: true, }, { - name: "PermuteInt16x8", + name: "PermuteInt64x8", argLen: 2, generic: true, }, { - name: "PermuteInt16x16", + name: "PermuteOrZeroGroupedInt8x32", argLen: 2, generic: true, }, { - name: "PermuteInt16x32", + name: "PermuteOrZeroGroupedInt8x64", argLen: 2, generic: true, }, { - name: "PermuteInt32x8", + name: "PermuteOrZeroGroupedUint8x32", argLen: 2, generic: true, }, { - name: "PermuteInt32x16", + name: "PermuteOrZeroGroupedUint8x64", argLen: 2, generic: true, }, { - name: "PermuteInt64x4", + name: "PermuteOrZeroInt8x16", argLen: 2, generic: true, }, { - name: "PermuteInt64x8", + name: "PermuteOrZeroUint8x16", argLen: 2, generic: true, }, @@ -91830,138 +92016,6 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, - { - name: "PermuteConstantGroupedInt32x8", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantGroupedInt32x16", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantGroupedUint32x8", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantGroupedUint32x16", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantHiGroupedInt16x16", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantHiGroupedInt16x32", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantHiGroupedUint16x16", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantHiGroupedUint16x32", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantHiInt16x8", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantHiInt32x4", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantHiUint16x8", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantHiUint32x4", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantInt32x4", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantLoGroupedInt16x16", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantLoGroupedInt16x32", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantLoGroupedUint16x16", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantLoGroupedUint16x32", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantLoInt16x8", - auxType: auxUInt8, - argLen: 1, - 
generic: true, - }, - { - name: "PermuteConstantLoInt32x4", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantLoUint16x8", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantLoUint32x4", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantUint32x4", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, { name: "RotateAllLeftInt32x4", auxType: auxUInt8, @@ -92676,6 +92730,114 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "permuteScalarsGroupedInt32x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsGroupedInt32x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsGroupedUint32x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsGroupedUint32x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsHiGroupedInt16x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsHiGroupedInt16x32", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsHiGroupedUint16x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsHiGroupedUint16x32", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsHiInt16x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsHiUint16x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsInt32x4", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsLoGroupedInt16x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsLoGroupedInt16x32", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsLoGroupedUint16x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsLoGroupedUint16x32", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsLoInt16x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsLoUint16x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsUint32x4", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, { name: "ternInt32x4", auxType: auxUInt8, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 76e524d524..5ad2ed3f96 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -2546,6 +2546,96 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpCompressUint8x32(v) case OpCompressUint8x64: return rewriteValueAMD64_OpCompressUint8x64(v) + case OpConcatPermuteFloat32x16: + v.Op = OpAMD64VPERMI2PS512 + return true + case OpConcatPermuteFloat32x4: + v.Op = OpAMD64VPERMI2PS128 + return true + case OpConcatPermuteFloat32x8: + v.Op = OpAMD64VPERMI2PS256 + return true + case OpConcatPermuteFloat64x2: + v.Op = OpAMD64VPERMI2PD128 + return true + case OpConcatPermuteFloat64x4: + v.Op = OpAMD64VPERMI2PD256 + return true + case OpConcatPermuteFloat64x8: + v.Op = OpAMD64VPERMI2PD512 + return true + case OpConcatPermuteInt16x16: + v.Op = OpAMD64VPERMI2W256 + return true + case OpConcatPermuteInt16x32: + v.Op = OpAMD64VPERMI2W512 + return true + case OpConcatPermuteInt16x8: + v.Op = OpAMD64VPERMI2W128 + return true + case OpConcatPermuteInt32x16: + v.Op = OpAMD64VPERMI2D512 + 
return true + case OpConcatPermuteInt32x4: + v.Op = OpAMD64VPERMI2D128 + return true + case OpConcatPermuteInt32x8: + v.Op = OpAMD64VPERMI2D256 + return true + case OpConcatPermuteInt64x2: + v.Op = OpAMD64VPERMI2Q128 + return true + case OpConcatPermuteInt64x4: + v.Op = OpAMD64VPERMI2Q256 + return true + case OpConcatPermuteInt64x8: + v.Op = OpAMD64VPERMI2Q512 + return true + case OpConcatPermuteInt8x16: + v.Op = OpAMD64VPERMI2B128 + return true + case OpConcatPermuteInt8x32: + v.Op = OpAMD64VPERMI2B256 + return true + case OpConcatPermuteInt8x64: + v.Op = OpAMD64VPERMI2B512 + return true + case OpConcatPermuteUint16x16: + v.Op = OpAMD64VPERMI2W256 + return true + case OpConcatPermuteUint16x32: + v.Op = OpAMD64VPERMI2W512 + return true + case OpConcatPermuteUint16x8: + v.Op = OpAMD64VPERMI2W128 + return true + case OpConcatPermuteUint32x16: + v.Op = OpAMD64VPERMI2D512 + return true + case OpConcatPermuteUint32x4: + v.Op = OpAMD64VPERMI2D128 + return true + case OpConcatPermuteUint32x8: + v.Op = OpAMD64VPERMI2D256 + return true + case OpConcatPermuteUint64x2: + v.Op = OpAMD64VPERMI2Q128 + return true + case OpConcatPermuteUint64x4: + v.Op = OpAMD64VPERMI2Q256 + return true + case OpConcatPermuteUint64x8: + v.Op = OpAMD64VPERMI2Q512 + return true + case OpConcatPermuteUint8x16: + v.Op = OpAMD64VPERMI2B128 + return true + case OpConcatPermuteUint8x32: + v.Op = OpAMD64VPERMI2B256 + return true + case OpConcatPermuteUint8x64: + v.Op = OpAMD64VPERMI2B512 + return true case OpConcatShiftBytesRightGroupedUint8x32: v.Op = OpAMD64VPALIGNR256 return true @@ -4476,162 +4566,6 @@ func rewriteValueAMD64(v *Value) bool { case OpPanicBounds: v.Op = OpAMD64LoweredPanicBoundsRR return true - case OpPermute2Float32x16: - v.Op = OpAMD64VPERMI2PS512 - return true - case OpPermute2Float32x4: - v.Op = OpAMD64VPERMI2PS128 - return true - case OpPermute2Float32x8: - v.Op = OpAMD64VPERMI2PS256 - return true - case OpPermute2Float64x2: - v.Op = OpAMD64VPERMI2PD128 - return true - case OpPermute2Float64x4: - v.Op = OpAMD64VPERMI2PD256 - return true - case OpPermute2Float64x8: - v.Op = OpAMD64VPERMI2PD512 - return true - case OpPermute2Int16x16: - v.Op = OpAMD64VPERMI2W256 - return true - case OpPermute2Int16x32: - v.Op = OpAMD64VPERMI2W512 - return true - case OpPermute2Int16x8: - v.Op = OpAMD64VPERMI2W128 - return true - case OpPermute2Int32x16: - v.Op = OpAMD64VPERMI2D512 - return true - case OpPermute2Int32x4: - v.Op = OpAMD64VPERMI2D128 - return true - case OpPermute2Int32x8: - v.Op = OpAMD64VPERMI2D256 - return true - case OpPermute2Int64x2: - v.Op = OpAMD64VPERMI2Q128 - return true - case OpPermute2Int64x4: - v.Op = OpAMD64VPERMI2Q256 - return true - case OpPermute2Int64x8: - v.Op = OpAMD64VPERMI2Q512 - return true - case OpPermute2Int8x16: - v.Op = OpAMD64VPERMI2B128 - return true - case OpPermute2Int8x32: - v.Op = OpAMD64VPERMI2B256 - return true - case OpPermute2Int8x64: - v.Op = OpAMD64VPERMI2B512 - return true - case OpPermute2Uint16x16: - v.Op = OpAMD64VPERMI2W256 - return true - case OpPermute2Uint16x32: - v.Op = OpAMD64VPERMI2W512 - return true - case OpPermute2Uint16x8: - v.Op = OpAMD64VPERMI2W128 - return true - case OpPermute2Uint32x16: - v.Op = OpAMD64VPERMI2D512 - return true - case OpPermute2Uint32x4: - v.Op = OpAMD64VPERMI2D128 - return true - case OpPermute2Uint32x8: - v.Op = OpAMD64VPERMI2D256 - return true - case OpPermute2Uint64x2: - v.Op = OpAMD64VPERMI2Q128 - return true - case OpPermute2Uint64x4: - v.Op = OpAMD64VPERMI2Q256 - return true - case OpPermute2Uint64x8: - v.Op = 
OpAMD64VPERMI2Q512 - return true - case OpPermute2Uint8x16: - v.Op = OpAMD64VPERMI2B128 - return true - case OpPermute2Uint8x32: - v.Op = OpAMD64VPERMI2B256 - return true - case OpPermute2Uint8x64: - v.Op = OpAMD64VPERMI2B512 - return true - case OpPermuteConstantGroupedInt32x16: - v.Op = OpAMD64VPSHUFD512 - return true - case OpPermuteConstantGroupedInt32x8: - v.Op = OpAMD64VPSHUFD256 - return true - case OpPermuteConstantGroupedUint32x16: - v.Op = OpAMD64VPSHUFD512 - return true - case OpPermuteConstantGroupedUint32x8: - v.Op = OpAMD64VPSHUFD256 - return true - case OpPermuteConstantHiGroupedInt16x16: - v.Op = OpAMD64VPSHUFHW256 - return true - case OpPermuteConstantHiGroupedInt16x32: - v.Op = OpAMD64VPSHUFHW512 - return true - case OpPermuteConstantHiGroupedUint16x16: - v.Op = OpAMD64VPSHUFHW256 - return true - case OpPermuteConstantHiGroupedUint16x32: - v.Op = OpAMD64VPSHUFHW512 - return true - case OpPermuteConstantHiInt16x8: - v.Op = OpAMD64VPSHUFHW128 - return true - case OpPermuteConstantHiInt32x4: - v.Op = OpAMD64VPSHUFHW128 - return true - case OpPermuteConstantHiUint16x8: - v.Op = OpAMD64VPSHUFHW128 - return true - case OpPermuteConstantHiUint32x4: - v.Op = OpAMD64VPSHUFHW128 - return true - case OpPermuteConstantInt32x4: - v.Op = OpAMD64VPSHUFD128 - return true - case OpPermuteConstantLoGroupedInt16x16: - v.Op = OpAMD64VPSHUFHW256 - return true - case OpPermuteConstantLoGroupedInt16x32: - v.Op = OpAMD64VPSHUFHW512 - return true - case OpPermuteConstantLoGroupedUint16x16: - v.Op = OpAMD64VPSHUFHW256 - return true - case OpPermuteConstantLoGroupedUint16x32: - v.Op = OpAMD64VPSHUFHW512 - return true - case OpPermuteConstantLoInt16x8: - v.Op = OpAMD64VPSHUFHW128 - return true - case OpPermuteConstantLoInt32x4: - v.Op = OpAMD64VPSHUFHW128 - return true - case OpPermuteConstantLoUint16x8: - v.Op = OpAMD64VPSHUFHW128 - return true - case OpPermuteConstantLoUint32x4: - v.Op = OpAMD64VPSHUFHW128 - return true - case OpPermuteConstantUint32x4: - v.Op = OpAMD64VPSHUFD128 - return true case OpPermuteFloat32x16: v.Op = OpAMD64VPERMPS512 return true @@ -4644,18 +4578,6 @@ func rewriteValueAMD64(v *Value) bool { case OpPermuteFloat64x8: v.Op = OpAMD64VPERMPD512 return true - case OpPermuteGroupedInt8x32: - v.Op = OpAMD64VPSHUFB256 - return true - case OpPermuteGroupedInt8x64: - v.Op = OpAMD64VPSHUFB512 - return true - case OpPermuteGroupedUint8x32: - v.Op = OpAMD64VPSHUFB256 - return true - case OpPermuteGroupedUint8x64: - v.Op = OpAMD64VPSHUFB512 - return true case OpPermuteInt16x16: v.Op = OpAMD64VPERMW256 return true @@ -4678,7 +4600,7 @@ func rewriteValueAMD64(v *Value) bool { v.Op = OpAMD64VPERMQ512 return true case OpPermuteInt8x16: - v.Op = OpAMD64VPSHUFB128 + v.Op = OpAMD64VPERMB128 return true case OpPermuteInt8x32: v.Op = OpAMD64VPERMB256 @@ -4686,6 +4608,24 @@ func rewriteValueAMD64(v *Value) bool { case OpPermuteInt8x64: v.Op = OpAMD64VPERMB512 return true + case OpPermuteOrZeroGroupedInt8x32: + v.Op = OpAMD64VPSHUFB256 + return true + case OpPermuteOrZeroGroupedInt8x64: + v.Op = OpAMD64VPSHUFB512 + return true + case OpPermuteOrZeroGroupedUint8x32: + v.Op = OpAMD64VPSHUFB256 + return true + case OpPermuteOrZeroGroupedUint8x64: + v.Op = OpAMD64VPSHUFB512 + return true + case OpPermuteOrZeroInt8x16: + v.Op = OpAMD64VPSHUFB128 + return true + case OpPermuteOrZeroUint8x16: + v.Op = OpAMD64VPSHUFB128 + return true case OpPermuteUint16x16: v.Op = OpAMD64VPERMW256 return true @@ -4708,7 +4648,7 @@ func rewriteValueAMD64(v *Value) bool { v.Op = OpAMD64VPERMQ512 return true case 
OpPermuteUint8x16: - v.Op = OpAMD64VPSHUFB128 + v.Op = OpAMD64VPERMB128 return true case OpPermuteUint8x32: v.Op = OpAMD64VPERMB256 @@ -6124,6 +6064,60 @@ func rewriteValueAMD64(v *Value) bool { case OpconcatSelectedConstantUint64x2: v.Op = OpAMD64VSHUFPD128 return true + case OppermuteScalarsGroupedInt32x16: + v.Op = OpAMD64VPSHUFD512 + return true + case OppermuteScalarsGroupedInt32x8: + v.Op = OpAMD64VPSHUFD256 + return true + case OppermuteScalarsGroupedUint32x16: + v.Op = OpAMD64VPSHUFD512 + return true + case OppermuteScalarsGroupedUint32x8: + v.Op = OpAMD64VPSHUFD256 + return true + case OppermuteScalarsHiGroupedInt16x16: + v.Op = OpAMD64VPSHUFHW256 + return true + case OppermuteScalarsHiGroupedInt16x32: + v.Op = OpAMD64VPSHUFHW512 + return true + case OppermuteScalarsHiGroupedUint16x16: + v.Op = OpAMD64VPSHUFHW256 + return true + case OppermuteScalarsHiGroupedUint16x32: + v.Op = OpAMD64VPSHUFHW512 + return true + case OppermuteScalarsHiInt16x8: + v.Op = OpAMD64VPSHUFHW128 + return true + case OppermuteScalarsHiUint16x8: + v.Op = OpAMD64VPSHUFHW128 + return true + case OppermuteScalarsInt32x4: + v.Op = OpAMD64VPSHUFD128 + return true + case OppermuteScalarsLoGroupedInt16x16: + v.Op = OpAMD64VPSHUFLW256 + return true + case OppermuteScalarsLoGroupedInt16x32: + v.Op = OpAMD64VPSHUFLW512 + return true + case OppermuteScalarsLoGroupedUint16x16: + v.Op = OpAMD64VPSHUFLW256 + return true + case OppermuteScalarsLoGroupedUint16x32: + v.Op = OpAMD64VPSHUFLW512 + return true + case OppermuteScalarsLoInt16x8: + v.Op = OpAMD64VPSHUFLW128 + return true + case OppermuteScalarsLoUint16x8: + v.Op = OpAMD64VPSHUFLW128 + return true + case OppermuteScalarsUint32x4: + v.Op = OpAMD64VPSHUFD128 + return true case OpternInt32x16: v.Op = OpAMD64VPTERNLOGD512 return true @@ -31247,6 +31241,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked128(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) + // result: (VPERMI2WMasked128 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2W128 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2WMasked128) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU16Masked128 (VPMOVWB128_128 x) mask) // result: (VPMOVWBMasked128_128 x mask) for { @@ -31460,34 +31468,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked128(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) - // result: (VPERMI2WMasked128 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2W128 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2WMasked128) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) - // result: (VPSHUFHWMasked128 [a] x mask) - for { - if v_0.Op != OpAMD64VPSHUFHW128 { - break - } - a := auxIntToUint8(v_0.AuxInt) - x := v_0.Args[0] - mask := v_1 - v.reset(OpAMD64VPSHUFHWMasked128) - v.AuxInt = uint8ToAuxInt(a) - v.AddArg2(x, mask) - return true - } // match: (VMOVDQU16Masked128 (VPERMW128 x y) mask) // result: (VPERMWMasked128 x y mask) for { @@ -31676,6 +31656,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked128(v *Value) bool { v.AddArg3(x, y, mask) return true } + // match: (VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) + // result: (VPSHUFHWMasked128 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFHW128 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + 
v.reset(OpAMD64VPSHUFHWMasked128) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU16Masked128 (VPSHUFLW128 [a] x) mask) + // result: (VPSHUFLWMasked128 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFLW128 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFLWMasked128) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU16Masked128 (VPSLLW128const [a] x) mask) // result: (VPSLLWMasked128const [a] x mask) for { @@ -31785,6 +31793,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked256(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) + // result: (VPERMI2WMasked256 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2W256 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2WMasked256) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU16Masked256 (VPMOVWB128_256 x) mask) // result: (VPMOVWBMasked128_256 x mask) for { @@ -32034,34 +32056,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked256(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) - // result: (VPERMI2WMasked256 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2W256 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2WMasked256) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) - // result: (VPSHUFHWMasked256 [a] x mask) - for { - if v_0.Op != OpAMD64VPSHUFHW256 { - break - } - a := auxIntToUint8(v_0.AuxInt) - x := v_0.Args[0] - mask := v_1 - v.reset(OpAMD64VPSHUFHWMasked256) - v.AuxInt = uint8ToAuxInt(a) - v.AddArg2(x, mask) - return true - } // match: (VMOVDQU16Masked256 (VPERMW256 x y) mask) // result: (VPERMWMasked256 x y mask) for { @@ -32250,6 +32244,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked256(v *Value) bool { v.AddArg3(x, y, mask) return true } + // match: (VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) + // result: (VPSHUFHWMasked256 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFHW256 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFHWMasked256) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU16Masked256 (VPSHUFLW256 [a] x) mask) + // result: (VPSHUFLWMasked256 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFLW256 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFLWMasked256) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU16Masked256 (VPSLLW256const [a] x) mask) // result: (VPSLLWMasked256const [a] x mask) for { @@ -32359,6 +32381,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked512(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) + // result: (VPERMI2WMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2W512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2WMasked512) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU16Masked512 (VPMOVSXWD512 x) mask) // result: (VPMOVSXWDMasked512 x mask) for { @@ -32536,34 +32572,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked512(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) - // 
result: (VPERMI2WMasked512 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2W512 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2WMasked512) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) - // result: (VPSHUFHWMasked512 [a] x mask) - for { - if v_0.Op != OpAMD64VPSHUFHW512 { - break - } - a := auxIntToUint8(v_0.AuxInt) - x := v_0.Args[0] - mask := v_1 - v.reset(OpAMD64VPSHUFHWMasked512) - v.AuxInt = uint8ToAuxInt(a) - v.AddArg2(x, mask) - return true - } // match: (VMOVDQU16Masked512 (VPERMW512 x y) mask) // result: (VPERMWMasked512 x y mask) for { @@ -32752,6 +32760,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked512(v *Value) bool { v.AddArg3(x, y, mask) return true } + // match: (VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) + // result: (VPSHUFHWMasked512 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFHW512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFHWMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSHUFLW512 [a] x) mask) + // result: (VPSHUFLWMasked512 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFLW512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFLWMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU16Masked512 (VPSLLW512const [a] x) mask) // result: (VPSLLWMasked512const [a] x mask) for { @@ -32875,6 +32911,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked128(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) + // result: (VPERMI2PSMasked128 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2PS128 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2PSMasked128) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) + // result: (VPERMI2DMasked128 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2D128 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2DMasked128) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU32Masked128 (VPMOVDB128_128 x) mask) // result: (VPMOVDBMasked128_128 x mask) for { @@ -33232,48 +33296,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked128(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) - // result: (VPERMI2PSMasked128 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2PS128 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2PSMasked128) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) - // result: (VPERMI2DMasked128 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2D128 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2DMasked128) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) - // result: (VPSHUFDMasked128 [a] x mask) - for { - if v_0.Op != OpAMD64VPSHUFD128 { - break - } - a := auxIntToUint8(v_0.AuxInt) - x := v_0.Args[0] - mask := v_1 - v.reset(OpAMD64VPSHUFDMasked128) - v.AuxInt = uint8ToAuxInt(a) - v.AddArg2(x, mask) - return true - } // match: (VMOVDQU32Masked128 (VPROLD128 [a] x) mask) // result: 
(VPROLDMasked128 [a] x mask) for { @@ -33515,6 +33537,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked128(v *Value) bool { v.AddArg3(x, y, mask) return true } + // match: (VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) + // result: (VPSHUFDMasked128 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFD128 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFDMasked128) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU32Masked128 (VPSLLD128const [a] x) mask) // result: (VPSLLDMasked128const [a] x mask) for { @@ -33638,6 +33674,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked256(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) + // result: (VPERMI2PSMasked256 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2PS256 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2PSMasked256) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) + // result: (VPERMI2DMasked256 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2D256 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2DMasked256) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU32Masked256 (VPMOVDB128_256 x) mask) // result: (VPMOVDBMasked128_256 x mask) for { @@ -34031,48 +34095,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked256(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) - // result: (VPERMI2PSMasked256 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2PS256 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2PSMasked256) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) - // result: (VPERMI2DMasked256 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2D256 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2DMasked256) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) - // result: (VPSHUFDMasked256 [a] x mask) - for { - if v_0.Op != OpAMD64VPSHUFD256 { - break - } - a := auxIntToUint8(v_0.AuxInt) - x := v_0.Args[0] - mask := v_1 - v.reset(OpAMD64VPSHUFDMasked256) - v.AuxInt = uint8ToAuxInt(a) - v.AddArg2(x, mask) - return true - } // match: (VMOVDQU32Masked256 (VPERMPS256 x y) mask) // result: (VPERMPSMasked256 x y mask) for { @@ -34340,6 +34362,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked256(v *Value) bool { v.AddArg3(x, y, mask) return true } + // match: (VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) + // result: (VPSHUFDMasked256 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFD256 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFDMasked256) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU32Masked256 (VPSLLD256const [a] x) mask) // result: (VPSLLDMasked256const [a] x mask) for { @@ -34489,6 +34525,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) + // result: (VPERMI2PSMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2PS512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + 
v.reset(OpAMD64VPERMI2PSMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) + // result: (VPERMI2DMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2D512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2DMasked512) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU32Masked512 (VPMOVDB128_512 x) mask) // result: (VPMOVDBMasked128_512 x mask) for { @@ -34823,48 +34887,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool { v.AddArg3(x, y, mask) return true } - // match: (VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) - // result: (VPERMI2PSMasked512 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2PS512 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2PSMasked512) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) - // result: (VPERMI2DMasked512 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2D512 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2DMasked512) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) - // result: (VPSHUFDMasked512 [a] x mask) - for { - if v_0.Op != OpAMD64VPSHUFD512 { - break - } - a := auxIntToUint8(v_0.AuxInt) - x := v_0.Args[0] - mask := v_1 - v.reset(OpAMD64VPSHUFDMasked512) - v.AuxInt = uint8ToAuxInt(a) - v.AddArg2(x, mask) - return true - } // match: (VMOVDQU32Masked512 (VPERMPS512 x y) mask) // result: (VPERMPSMasked512 x y mask) for { @@ -35169,6 +35191,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool { v.AddArg3(x, y, mask) return true } + // match: (VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) + // result: (VPSHUFDMasked512 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFD512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFDMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU32Masked512 (VPSLLD512const [a] x) mask) // result: (VPSLLDMasked512const [a] x mask) for { @@ -35280,6 +35316,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked128(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) + // result: (VPERMI2PDMasked128 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2PD128 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2PDMasked128) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) + // result: (VPERMI2QMasked128 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2Q128 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2QMasked128) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU64Masked128 (VPMOVQB128_128 x) mask) // result: (VPMOVQBMasked128_128 x mask) for { @@ -35571,34 +35635,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked128(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) - // result: (VPERMI2PDMasked128 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2PD128 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2PDMasked128) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU64Masked128 
(VPERMI2Q128 x y z) mask) - // result: (VPERMI2QMasked128 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2Q128 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2QMasked128) - v.AddArg4(x, y, z, mask) - return true - } // match: (VMOVDQU64Masked128 (VRCP14PD128 x) mask) // result: (VRCP14PDMasked128 x mask) for { @@ -35987,6 +36023,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked256(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) + // result: (VPERMI2PDMasked256 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2PD256 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2PDMasked256) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) + // result: (VPERMI2QMasked256 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2Q256 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2QMasked256) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU64Masked256 (VPMOVQB128_256 x) mask) // result: (VPMOVQBMasked128_256 x mask) for { @@ -36314,34 +36378,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked256(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) - // result: (VPERMI2PDMasked256 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2PD256 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2PDMasked256) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) - // result: (VPERMI2QMasked256 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2Q256 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2QMasked256) - v.AddArg4(x, y, z, mask) - return true - } // match: (VMOVDQU64Masked256 (VPERMPD256 x y) mask) // result: (VPERMPDMasked256 x y mask) for { @@ -36782,6 +36818,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked512(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) + // result: (VPERMI2PDMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2PD512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2PDMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) + // result: (VPERMI2QMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2Q512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2QMasked512) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU64Masked512 (VPMOVQB128_512 x) mask) // result: (VPMOVQBMasked128_512 x mask) for { @@ -37050,34 +37114,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked512(v *Value) bool { v.AddArg3(x, y, mask) return true } - // match: (VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) - // result: (VPERMI2PDMasked512 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2PD512 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2PDMasked512) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) - // result: (VPERMI2QMasked512 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2Q512 { - break - } - z := v_0.Args[2] - x := 
v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2QMasked512) - v.AddArg4(x, y, z, mask) - return true - } // match: (VMOVDQU64Masked512 (VPERMPD512 x y) mask) // result: (VPERMPDMasked512 x y mask) for { @@ -37491,6 +37527,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked128(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) + // result: (VPERMI2BMasked128 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2B128 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2BMasked128) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask) // result: (VPALIGNRMasked128 [a] x y mask) for { @@ -37685,18 +37735,17 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked128(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) - // result: (VPERMI2BMasked128 x y z mask) + // match: (VMOVDQU8Masked128 (VPERMB128 x y) mask) + // result: (VPERMBMasked128 x y mask) for { - if v_0.Op != OpAMD64VPERMI2B128 { + if v_0.Op != OpAMD64VPERMB128 { break } - z := v_0.Args[2] - x := v_0.Args[0] y := v_0.Args[1] + x := v_0.Args[0] mask := v_1 - v.reset(OpAMD64VPERMI2BMasked128) - v.AddArg4(x, y, z, mask) + v.reset(OpAMD64VPERMBMasked128) + v.AddArg3(x, y, mask) return true } // match: (VMOVDQU8Masked128 (VPSHUFB128 x y) mask) @@ -37832,6 +37881,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked256(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) + // result: (VPERMI2BMasked256 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2B256 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2BMasked256) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask) // result: (VPALIGNRMasked256 [a] x y mask) for { @@ -38026,18 +38089,17 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked256(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) - // result: (VPERMI2BMasked256 x y z mask) + // match: (VMOVDQU8Masked256 (VPERMB256 x y) mask) + // result: (VPERMBMasked256 x y mask) for { - if v_0.Op != OpAMD64VPERMI2B256 { + if v_0.Op != OpAMD64VPERMB256 { break } - z := v_0.Args[2] - x := v_0.Args[0] y := v_0.Args[1] + x := v_0.Args[0] mask := v_1 - v.reset(OpAMD64VPERMI2BMasked256) - v.AddArg4(x, y, z, mask) + v.reset(OpAMD64VPERMBMasked256) + v.AddArg3(x, y, mask) return true } // match: (VMOVDQU8Masked256 (VPSHUFB256 x y) mask) @@ -38053,19 +38115,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked256(v *Value) bool { v.AddArg3(x, y, mask) return true } - // match: (VMOVDQU8Masked256 (VPERMB256 x y) mask) - // result: (VPERMBMasked256 x y mask) - for { - if v_0.Op != OpAMD64VPERMB256 { - break - } - y := v_0.Args[1] - x := v_0.Args[0] - mask := v_1 - v.reset(OpAMD64VPERMBMasked256) - v.AddArg3(x, y, mask) - return true - } // match: (VMOVDQU8Masked256 (VPSUBB256 x y) mask) // result: (VPSUBBMasked256 x y mask) for { @@ -38186,6 +38235,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) + // result: (VPERMI2BMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2B512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2BMasked512) + v.AddArg4(x, y, 
z, mask) + return true + } // match: (VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask) // result: (VPALIGNRMasked512 [a] x y mask) for { @@ -38380,18 +38443,17 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) - // result: (VPERMI2BMasked512 x y z mask) + // match: (VMOVDQU8Masked512 (VPERMB512 x y) mask) + // result: (VPERMBMasked512 x y mask) for { - if v_0.Op != OpAMD64VPERMI2B512 { + if v_0.Op != OpAMD64VPERMB512 { break } - z := v_0.Args[2] - x := v_0.Args[0] y := v_0.Args[1] + x := v_0.Args[0] mask := v_1 - v.reset(OpAMD64VPERMI2BMasked512) - v.AddArg4(x, y, z, mask) + v.reset(OpAMD64VPERMBMasked512) + v.AddArg3(x, y, mask) return true } // match: (VMOVDQU8Masked512 (VPSHUFB512 x y) mask) @@ -38407,19 +38469,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v *Value) bool { v.AddArg3(x, y, mask) return true } - // match: (VMOVDQU8Masked512 (VPERMB512 x y) mask) - // result: (VPERMBMasked512 x y mask) - for { - if v_0.Op != OpAMD64VPERMB512 { - break - } - y := v_0.Args[1] - x := v_0.Args[0] - mask := v_1 - v.reset(OpAMD64VPERMBMasked512) - v.AddArg3(x, y, mask) - return true - } // match: (VMOVDQU8Masked512 (VPSUBB512 x y) mask) // result: (VPSUBBMasked512 x y mask) for { @@ -42642,6 +42691,21 @@ func rewriteValueAMD64_OpAMD64VPBLENDMWMasked512(v *Value) bool { v.AddArg3(dst, x, mask) return true } + // match: (VPBLENDMWMasked512 dst (VPSHUFLW512 [a] x) mask) + // result: (VPSHUFLWMasked512Merging dst [a] x mask) + for { + dst := v_0 + if v_1.Op != OpAMD64VPSHUFLW512 { + break + } + a := auxIntToUint8(v_1.AuxInt) + x := v_1.Args[0] + mask := v_2 + v.reset(OpAMD64VPSHUFLWMasked512Merging) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg3(dst, x, mask) + return true + } // match: (VPBLENDMWMasked512 dst (VPSLLVW512 x y) mask) // result: (VPSLLVWMasked512Merging dst x y mask) for { @@ -45526,6 +45590,27 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB128(v *Value) bool { v.AddArg3(dst, x, v0) return true } + // match: (VPBLENDVB128 dst (VPSHUFLW128 [a] x) mask) + // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) + // result: (VPSHUFLWMasked128Merging dst [a] x (VPMOVVec16x8ToM mask)) + for { + dst := v_0 + if v_1.Op != OpAMD64VPSHUFLW128 { + break + } + a := auxIntToUint8(v_1.AuxInt) + x := v_1.Args[0] + mask := v_2 + if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) { + break + } + v.reset(OpAMD64VPSHUFLWMasked128Merging) + v.AuxInt = uint8ToAuxInt(a) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(dst, x, v0) + return true + } // match: (VPBLENDVB128 dst (VPSLLD128const [a] x) mask) // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) // result: (VPSLLDMasked128constMerging dst [a] x (VPMOVVec32x4ToM mask)) @@ -48223,6 +48308,27 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB256(v *Value) bool { v.AddArg3(dst, x, v0) return true } + // match: (VPBLENDVB256 dst (VPSHUFLW256 [a] x) mask) + // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) + // result: (VPSHUFLWMasked256Merging dst [a] x (VPMOVVec16x16ToM mask)) + for { + dst := v_0 + if v_1.Op != OpAMD64VPSHUFLW256 { + break + } + a := auxIntToUint8(v_1.AuxInt) + x := v_1.Args[0] + mask := v_2 + if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) { + break + } + v.reset(OpAMD64VPSHUFLWMasked256Merging) + v.AuxInt = uint8ToAuxInt(a) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(dst, x, v0) + return true + } // match: (VPBLENDVB256 dst (VPSLLD256const [a] x) 
mask) // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) // result: (VPSLLDMasked256constMerging dst [a] x (VPMOVVec32x8ToM mask)) diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 818b3544ae..34e491371e 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -228,6 +228,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint64x2.Compress", opLen2(ssa.OpCompressUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.Compress", opLen2(ssa.OpCompressUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Compress", opLen2(ssa.OpCompressUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x64.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x2, 
types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64) @@ -802,8 +832,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.Or", opLen2(ssa.OpOrUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Or", opLen2(ssa.OpOrUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.Permute", opLen2(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x16.Permute", opLen2(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x32.Permute", opLen2_21(ssa.OpPermuteUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.Permute", opLen2_21(ssa.OpPermuteInt8x64, types.TypeVec512), sys.AMD64) @@ -826,62 +856,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Float64x8.Permute", opLen2_21(ssa.OpPermuteFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int64x8.Permute", opLen2_21(ssa.OpPermuteInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint64x8.Permute", opLen2_21(ssa.OpPermuteUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.Permute2", opLen3_231(ssa.OpPermute2Int8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x16.Permute2", opLen3_231(ssa.OpPermute2Uint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.Permute2", opLen3_231(ssa.OpPermute2Int8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x32.Permute2", opLen3_231(ssa.OpPermute2Uint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.Permute2", opLen3_231(ssa.OpPermute2Int8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x64.Permute2", opLen3_231(ssa.OpPermute2Uint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.Permute2", opLen3_231(ssa.OpPermute2Int16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x8.Permute2", opLen3_231(ssa.OpPermute2Uint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.Permute2", opLen3_231(ssa.OpPermute2Int16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x16.Permute2", opLen3_231(ssa.OpPermute2Uint16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.Permute2", opLen3_231(ssa.OpPermute2Int16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x32.Permute2", opLen3_231(ssa.OpPermute2Uint16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.Permute2", opLen3_231(ssa.OpPermute2Float32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x4.Permute2", opLen3_231(ssa.OpPermute2Int32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x4.Permute2", opLen3_231(ssa.OpPermute2Uint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.Permute2", opLen3_231(ssa.OpPermute2Float32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x8.Permute2", opLen3_231(ssa.OpPermute2Int32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint32x8.Permute2", opLen3_231(ssa.OpPermute2Uint32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.Permute2", opLen3_231(ssa.OpPermute2Float32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x16.Permute2", opLen3_231(ssa.OpPermute2Int32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint32x16.Permute2", opLen3_231(ssa.OpPermute2Uint32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.Permute2", opLen3_231(ssa.OpPermute2Float64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x2.Permute2", opLen3_231(ssa.OpPermute2Int64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint64x2.Permute2", opLen3_231(ssa.OpPermute2Uint64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.Permute2", opLen3_231(ssa.OpPermute2Float64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int64x4.Permute2", opLen3_231(ssa.OpPermute2Int64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint64x4.Permute2", opLen3_231(ssa.OpPermute2Uint64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.Permute2", opLen3_231(ssa.OpPermute2Float64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int64x8.Permute2", opLen3_231(ssa.OpPermute2Int64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint64x8.Permute2", opLen3_231(ssa.OpPermute2Uint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.PermuteConstant", 
opLen1Imm8(ssa.OpPermuteConstantInt32x4, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Uint32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantUint32x4, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Int32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x8, types.TypeVec256, 0), sys.AMD64) - addF(simdPackage, "Int32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x16, types.TypeVec512, 0), sys.AMD64) - addF(simdPackage, "Uint32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x8, types.TypeVec256, 0), sys.AMD64) - addF(simdPackage, "Uint32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64) - addF(simdPackage, "Int16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt16x8, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Int32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt32x4, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Uint16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint16x8, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Uint32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint32x4, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Int16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64) - addF(simdPackage, "Int16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64) - addF(simdPackage, "Uint16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64) - addF(simdPackage, "Uint16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64) - addF(simdPackage, "Int16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt16x8, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Int32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt32x4, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Uint16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint16x8, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Uint32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint32x4, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Int16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64) - addF(simdPackage, "Int16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64) - addF(simdPackage, "Uint16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64) - addF(simdPackage, "Uint16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64) - addF(simdPackage, "Int8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.PermuteOrZero", opLen2(ssa.OpPermuteOrZeroInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x16.PermuteOrZero", opLen2(ssa.OpPermuteOrZeroUint8x16, types.TypeVec128), 
sys.AMD64) + addF(simdPackage, "Int8x32.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x32.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.Reciprocal", opLen1(ssa.OpReciprocalFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Reciprocal", opLen1(ssa.OpReciprocalFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.Reciprocal", opLen1(ssa.OpReciprocalFloat32x16, types.TypeVec512), sys.AMD64) @@ -1300,6 +1280,24 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint32x16.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64) addF(simdPackage, "Uint64x4.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Uint64x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Int32x4.permuteScalars", opLen1Imm8(ssa.OppermuteScalarsInt32x4, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Uint32x4.permuteScalars", opLen1Imm8(ssa.OppermuteScalarsUint32x4, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Int32x8.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedInt32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int32x16.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedInt32x16, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Uint32x8.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedUint32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint32x16.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedUint32x16, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Int16x8.permuteScalarsHi", opLen1Imm8(ssa.OppermuteScalarsHiInt16x8, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Uint16x8.permuteScalarsHi", opLen1Imm8(ssa.OppermuteScalarsHiUint16x8, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Int16x16.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int16x32.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Uint16x16.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint16x32.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Int16x8.permuteScalarsLo", opLen1Imm8(ssa.OppermuteScalarsLoInt16x8, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Uint16x8.permuteScalarsLo", opLen1Imm8(ssa.OppermuteScalarsLoUint16x8, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Int16x16.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int16x32.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Uint16x16.permuteScalarsLoGrouped", 
opLen1Imm8(ssa.OppermuteScalarsLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint16x32.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64) addF(simdPackage, "Int32x4.tern", opLen3Imm8(ssa.OpternInt32x4, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Int32x8.tern", opLen3Imm8(ssa.OpternInt32x8, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Int32x16.tern", opLen3Imm8(ssa.OpternInt32x16, types.TypeVec512, 0), sys.AMD64) diff --git a/src/simd/_gen/simdgen/gen_simdGenericOps.go b/src/simd/_gen/simdgen/gen_simdGenericOps.go index 3dbbeb09f7..bcbc18b3b2 100644 --- a/src/simd/_gen/simdgen/gen_simdGenericOps.go +++ b/src/simd/_gen/simdgen/gen_simdGenericOps.go @@ -46,6 +46,9 @@ func writeSIMDGenericOps(ops []Operation) *bytes.Buffer { if op.NoGenericOps != nil && *op.NoGenericOps == "true" { continue } + if op.SkipMaskedMethod() { + continue + } _, _, _, immType, gOp := op.shape() gOpData := genericOpsData{gOp.GenericName(), len(gOp.In), op.Commutative} if immType == VarImm || immType == ConstVarImm { diff --git a/src/simd/_gen/simdgen/gen_simdIntrinsics.go b/src/simd/_gen/simdgen/gen_simdIntrinsics.go index b963fb9abb..04344dc831 100644 --- a/src/simd/_gen/simdgen/gen_simdIntrinsics.go +++ b/src/simd/_gen/simdgen/gen_simdIntrinsics.go @@ -107,6 +107,9 @@ func writeSIMDIntrinsics(ops []Operation, typeMap simdTypeMap) *bytes.Buffer { if op.NoTypes != nil && *op.NoTypes == "true" { continue } + if op.SkipMaskedMethod() { + continue + } if s, op, err := classifyOp(op); err == nil { if err := t.ExecuteTemplate(buffer, s, op); err != nil { panic(fmt.Errorf("failed to execute template %s for op %s: %w", s, op.Go, err)) diff --git a/src/simd/_gen/simdgen/gen_simdTypes.go b/src/simd/_gen/simdgen/gen_simdTypes.go index 23b363d38a..dc5f77adaa 100644 --- a/src/simd/_gen/simdgen/gen_simdTypes.go +++ b/src/simd/_gen/simdgen/gen_simdTypes.go @@ -604,6 +604,9 @@ func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) (f, fI *bytes.Buffer) if op.NoTypes != nil && *op.NoTypes == "true" { continue } + if op.SkipMaskedMethod() { + continue + } idxVecAsScalar, err := checkVecAsScalar(op) if err != nil { panic(err) diff --git a/src/simd/_gen/simdgen/gen_simdrules.go b/src/simd/_gen/simdgen/gen_simdrules.go index 19393add71..5693496c92 100644 --- a/src/simd/_gen/simdgen/gen_simdrules.go +++ b/src/simd/_gen/simdgen/gen_simdrules.go @@ -345,7 +345,8 @@ func writeSIMDRules(ops []Operation) *bytes.Buffer { data.ArgsOut = "..." } data.tplName = tplName - if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" { + if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" || + opr.SkipMaskedMethod() { optData = append(optData, data) continue } diff --git a/src/simd/_gen/simdgen/godefs.go b/src/simd/_gen/simdgen/godefs.go index 7d3943b4b8..0b8fbd7e3d 100644 --- a/src/simd/_gen/simdgen/godefs.go +++ b/src/simd/_gen/simdgen/godefs.go @@ -73,6 +73,29 @@ type rawOperation struct { NoGenericOps *string // If non-nil, this string will be attached to the machine ssa op name. E.g. "const" SSAVariant *string + // If true, do not emit method declarations, generic ops, or intrinsics for masked variants + // DO emit the architecture-specific opcodes and optimizations. 
+ HideMaskMethods *bool +} + +func (o *Operation) IsMasked() bool { + if len(o.InVariant) == 0 { + return false + } + if len(o.InVariant) == 1 && o.InVariant[0].Class == "mask" { + return true + } + panic(fmt.Errorf("unknown inVariant")) +} + +func (o *Operation) SkipMaskedMethod() bool { + if o.HideMaskMethods == nil { + return false + } + if *o.HideMaskMethods && o.IsMasked() { + return true + } + return false } func (o *Operation) DecodeUnified(v *unify.Value) error { @@ -80,14 +103,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error { return err } - isMasked := false - if len(o.InVariant) == 0 { - // No variant - } else if len(o.InVariant) == 1 && o.InVariant[0].Class == "mask" { - isMasked = true - } else { - return fmt.Errorf("unknown inVariant") - } + isMasked := o.IsMasked() // Compute full Go method name. o.Go = o.rawOperation.Go @@ -104,6 +120,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error { o.Documentation = regexp.MustCompile(`\bNAME\b`).ReplaceAllString(o.Documentation, o.Go) if isMasked { o.Documentation += "\n//\n// This operation is applied selectively under a write mask." + // Suppress generic op and method declaration for exported methods, if a mask is present. if unicode.IsUpper([]rune(o.Go)[0]) { trueVal := "true" o.NoGenericOps = &trueVal diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml index bb47819f2f..44bd8efb7f 100644 --- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml @@ -27,18 +27,22 @@ constImm: 1 documentation: !string |- // NAME returns the upper half of x. +- go: PermuteOrZero + commutative: false + documentation: !string |- + // NAME performs a full permutation of vector x using indices: + // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} - go: Permute commutative: false documentation: !string |- // NAME performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} - // Only the needed bits to represent x's index are used in indices' elements. -- go: Permute2 # Permute2 is only available on or after AVX512 +- go: ConcatPermute # ConcatPermute is only available on or after AVX512 commutative: false documentation: !string |- // NAME performs a full permutation of vector x, y using indices: // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} - // where xy is x appending y. + // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. - go: Compress commutative: false @@ -74,31 +78,35 @@ documentation: !string |- // NAME copies element zero of its (128-bit) input to all elements of // the 512-bit output vector. +- go: PermuteOrZeroGrouped + commutative: false + documentation: !string |- # Detailed documentation will rely on the specific ops. + // NAME performs a grouped permutation of vector x using indices: - go: PermuteGrouped commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. // NAME performs a grouped permutation of vector x using indices: -- go: PermuteConstant +- go: permuteScalars commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. 
// NAME performs a permutation of vector x using constant indices: -- go: PermuteConstantGrouped +- go: permuteScalarsGrouped commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. // NAME performs a grouped permutation of vector x using constant indices: -- go: PermuteConstantLo +- go: permuteScalarsLo commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. // NAME performs a permutation of vector x using constant indices: -- go: PermuteConstantLoGrouped +- go: permuteScalarsLoGrouped commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. // NAME performs a grouped permutation of vector x using constant indices: -- go: PermuteConstantHi +- go: permuteScalarsHi commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. // NAME performs a permutation of vector x using constant indices: -- go: PermuteConstantHiGrouped +- go: permuteScalarsHiGrouped commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. // NAME performs a grouped permutation of vector x using constant indices: @@ -218,8 +226,10 @@ - go: Select128FromPair commutative: false documentation: !string |- - // NAME selects the low and high 128-bit halves from the 128-bit halves - // of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. + // NAME treats the 256-bit vectors x and y as a single vector of four + // 128-bit elements, and returns a 256-bit result formed by + // concatenating the two elements specified by lo and hi. + // For example, {4,5}.NAME(3,0,{6,7}) returns {7,4}. - go: ConcatShiftBytesRight commutative: false diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml index 75fbc532b8..697d6a8bce 100644 --- a/src/simd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml @@ -213,19 +213,75 @@ - *f64xN - go: Permute - asm: "VPERM[BWDQ]|VPERMP[SD]" + asm: "VPERMQ|VPERMPD" + addDoc: !string |- + // The low 2 bits (values 0-3) of each element of indices is used operandOrder: "21Type1" in: - &anyindices go: $t name: indices overwriteBase: uint + - &any4 + go: $t + lanes: 4 + out: - &any go: $t + +- go: Permute + asm: "VPERM[WDQ]|VPERMP[SD]" + addDoc: !string |- + // The low 3 bits (values 0-7) of each element of indices is used + operandOrder: "21Type1" + in: + - *anyindices + - &any8 + go: $t + lanes: 8 + out: + - *any + +- go: Permute + asm: "VPERM[BWD]|VPERMPS" + addDoc: !string |- + // The low 4 bits (values 0-15) of each element of indices is used + operandOrder: "21Type1" + in: + - *anyindices + - &any16 + go: $t + lanes: 16 out: - *any -- go: Permute2 +- go: Permute + asm: "VPERM[BW]" + addDoc: !string |- + // The low 5 bits (values 0-31) of each element of indices is used + operandOrder: "21Type1" + in: + - *anyindices + - &any32 + go: $t + lanes: 32 + out: + - *any + +- go: Permute + asm: "VPERMB" + addDoc: !string |- + // The low 6 bits (values 0-63) of each element of indices is used + operandOrder: "21Type1" + in: + - *anyindices + - &any64 + go: $t + lanes: 64 + out: + - *any + +- go: ConcatPermute asm: "VPERMI2[BWDQ]|VPERMI2P[SD]" # Because we are overwriting the receiver's type, we # have to move the receiver to be a parameter so that @@ -403,113 +459,137 @@ base: $b # VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. 
(It's AVX) -- go: Permute +- go: PermuteOrZero asm: VPSHUFB addDoc: !string |- - // However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. + // The lower four bits of each byte-sized index in indices select an element from x, + // unless the index's sign bit is set in which case zero is used instead. in: - &128any bits: 128 go: $t - bits: 128 - go: $t name: indices + base: int # always signed out: - *128any -- go: PermuteGrouped + +- go: PermuteOrZeroGrouped asm: VPSHUFB addDoc: !string |- - // result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} - // Only the needed bits to represent the index of a group of x are used in indices' elements. - // However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. + // result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} + // The lower four bits of each byte-sized index in indices select an element from its corresponding group in x, + // unless the index's sign bit is set in which case zero is used instead. // Each group is of size 128-bit. in: - &256Or512any bits: "256|512" go: $t - bits: "256|512" - go: $t + base: int name: indices out: - *256Or512any -- go: PermuteConstant +- go: permuteScalars asm: VPSHUFD addDoc: !string |- - // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} - // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} + // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. in: - *128any - class: immediate immOffset: 0 name: indices + hideMaskMethods: true out: - *128any -- go: PermuteConstantGrouped + +- go: permuteScalarsGrouped asm: VPSHUFD addDoc: !string |- - // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} - // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} + // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // Each group is of size 128-bit. in: - *256Or512any - class: immediate immOffset: 0 name: indices + hideMaskMethods: true out: - *256Or512any -- go: PermuteConstantLo - asm: VPSHUFHW +- go: permuteScalarsLo + asm: VPSHUFLW addDoc: !string |- - // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} - // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} + // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 
in: - - *128any + - &128lanes8 + bits: 128 + go: $t + elemBits: 16 - class: immediate immOffset: 0 name: indices + hideMaskMethods: true out: - - *128any -- go: PermuteConstantLoGrouped - asm: VPSHUFHW + - *128lanes8 + +- go: permuteScalarsLoGrouped + asm: VPSHUFLW addDoc: !string |- - // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} - // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // + // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7], + // x_group1[indices[0:2]], ...} + // + // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // Each group is of size 128-bit. in: - - *256Or512any + - &256Or512lanes8 + bits: "256|512" + go: $t + elemBits: 16 - class: immediate immOffset: 0 name: indices + hideMaskMethods: true out: - - *256Or512any + - *256Or512lanes8 -- go: PermuteConstantHi +- go: permuteScalarsHi asm: VPSHUFHW addDoc: !string |- - // result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} - // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} + // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. in: - - *128any + - *128lanes8 - class: immediate immOffset: 0 name: indices + hideMaskMethods: true out: - - *128any -- go: PermuteConstantHiGrouped + - *128lanes8 + +- go: permuteScalarsHiGrouped asm: VPSHUFHW addDoc: !string |- - // result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} - // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // result = + // + // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], + // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} + // + // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // Each group is of size 128-bit. 
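A note on the permuteScalars* entries above: the four 2-bit indices are packed low-to-high into the single immediate byte (the first index occupies bits 0:2), and the exported four-parameter methods (PermuteScalars, PermuteScalarsLo, PermuteScalarsHi, and their Grouped variants, exercised in the tests below) supply those four values. The following scalar sketch only restates the documented VPSHUFLW semantics under that assumption; permuteScalarsLoRef is a hypothetical reference helper, not part of this patch.

func permuteScalarsLoRef(x [8]int16, a, b, c, d uint8) [8]int16 {
	// Pack the four 2-bit indices low-to-high, as in the doc strings above:
	// a is indices[0:2], b is indices[2:4], and so on.
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | (d&3)<<6
	var r [8]int16
	for i := uint(0); i < 4; i++ {
		r[i] = x[(imm>>(2*i))&3] // result[i] = x[indices[2i:2i+2]]
	}
	copy(r[4:], x[4:]) // the upper four elements pass through unchanged
	return r
}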
in: - - *256Or512any + - *256Or512lanes8 - class: immediate immOffset: 0 name: indices + hideMaskMethods: true out: - - *256Or512any + - *256Or512lanes8 - go: InterleaveHi asm: VPUNPCKH(QDQ|DQ|WD|WB) diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go index 2d7793ef05..f51e3dc15f 100644 --- a/src/simd/internal/simd_test/simd_test.go +++ b/src/simd/internal/simd_test/simd_test.go @@ -163,7 +163,20 @@ func TestPermute(t *testing.T) { } } -func TestPermute2(t *testing.T) { +func TestPermuteOrZero(t *testing.T) { + x := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + indices := []int8{7, 6, 5, 4, 3, 2, 1, 0, -1, 8, -1, 9, -1, 10, -1, 11} + want := []uint8{8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 0, 10, 0, 11, 0, 12} + got := make([]uint8, len(x)) + simd.LoadUint8x16Slice(x).PermuteOrZero(simd.LoadInt8x16Slice(indices)).StoreSlice(got) + for i := range 8 { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestConcatPermute(t *testing.T) { if !simd.X86.AVX512() { t.Skip("Test requires X86.AVX512, not available on this hardware") return @@ -173,7 +186,7 @@ func TestPermute2(t *testing.T) { indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0} want := []int64{-8, 7, -6, 5, -4, 3, -2, 1} got := make([]int64, 8) - simd.LoadInt64x8Slice(x).Permute2(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got) + simd.LoadInt64x8Slice(x).ConcatPermute(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got) for i := range 8 { if want[i] != got[i] { t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) @@ -1161,3 +1174,75 @@ func TestDotProductQuadruple(t *testing.T) { } } } + +func TestPermuteScalars(t *testing.T) { + x := []int32{11, 12, 13, 14} + want := []int32{12, 13, 14, 11} + got := make([]int32, 4) + simd.LoadInt32x4Slice(x).PermuteScalars(1, 2, 3, 0).StoreSlice(got) + for i := range 4 { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestPermuteScalarsGrouped(t *testing.T) { + x := []int32{11, 12, 13, 14, 21, 22, 23, 24} + want := []int32{12, 13, 14, 11, 22, 23, 24, 21} + got := make([]int32, 8) + simd.LoadInt32x8Slice(x).PermuteScalarsGrouped(1, 2, 3, 0).StoreSlice(got) + for i := range 8 { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestPermuteScalarsHi(t *testing.T) { + x := []int16{-1, -2, -3, -4, 11, 12, 13, 14} + want := []int16{-1, -2, -3, -4, 12, 13, 14, 11} + got := make([]int16, len(x)) + simd.LoadInt16x8Slice(x).PermuteScalarsHi(1, 2, 3, 0).StoreSlice(got) + for i := range got { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestPermuteScalarsLo(t *testing.T) { + x := []int16{11, 12, 13, 14, 4, 5, 6, 7} + want := []int16{12, 13, 14, 11, 4, 5, 6, 7} + got := make([]int16, len(x)) + simd.LoadInt16x8Slice(x).PermuteScalarsLo(1, 2, 3, 0).StoreSlice(got) + for i := range got { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestPermuteScalarsHiGrouped(t *testing.T) { + x := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114} + want := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111} + got := make([]int16, len(x)) + 
simd.LoadInt16x16Slice(x).PermuteScalarsHiGrouped(1, 2, 3, 0).StoreSlice(got) + for i := range got { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestPermuteScalarsLoGrouped(t *testing.T) { + x := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17} + want := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17} + got := make([]int16, len(x)) + simd.LoadInt16x16Slice(x).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(got) + for i := range got { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index e06d1f652e..e9ddb463be 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -1272,6 +1272,248 @@ func (x Uint64x4) Compress(mask Mask64x4) Uint64x4 // Asm: VPCOMPRESSQ, CPU Feature: AVX512 func (x Uint64x8) Compress(mask Mask64x8) Uint64x8 +/* ConcatPermute */ + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2B, CPU Feature: AVX512VBMI +func (x Int8x16) ConcatPermute(y Int8x16, indices Uint8x16) Int8x16 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2B, CPU Feature: AVX512VBMI +func (x Uint8x16) ConcatPermute(y Uint8x16, indices Uint8x16) Uint8x16 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2B, CPU Feature: AVX512VBMI +func (x Int8x32) ConcatPermute(y Int8x32, indices Uint8x32) Int8x32 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2B, CPU Feature: AVX512VBMI +func (x Uint8x32) ConcatPermute(y Uint8x32, indices Uint8x32) Uint8x32 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2B, CPU Feature: AVX512VBMI +func (x Int8x64) ConcatPermute(y Int8x64, indices Uint8x64) Int8x64 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. 
+// +// Asm: VPERMI2B, CPU Feature: AVX512VBMI +func (x Uint8x64) ConcatPermute(y Uint8x64, indices Uint8x64) Uint8x64 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2W, CPU Feature: AVX512 +func (x Int16x8) ConcatPermute(y Int16x8, indices Uint16x8) Int16x8 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2W, CPU Feature: AVX512 +func (x Uint16x8) ConcatPermute(y Uint16x8, indices Uint16x8) Uint16x8 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2W, CPU Feature: AVX512 +func (x Int16x16) ConcatPermute(y Int16x16, indices Uint16x16) Int16x16 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2W, CPU Feature: AVX512 +func (x Uint16x16) ConcatPermute(y Uint16x16, indices Uint16x16) Uint16x16 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2W, CPU Feature: AVX512 +func (x Int16x32) ConcatPermute(y Int16x32, indices Uint16x32) Int16x32 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2W, CPU Feature: AVX512 +func (x Uint16x32) ConcatPermute(y Uint16x32, indices Uint16x32) Uint16x32 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2PS, CPU Feature: AVX512 +func (x Float32x4) ConcatPermute(y Float32x4, indices Uint32x4) Float32x4 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. 
+// +// Asm: VPERMI2D, CPU Feature: AVX512 +func (x Int32x4) ConcatPermute(y Int32x4, indices Uint32x4) Int32x4 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2D, CPU Feature: AVX512 +func (x Uint32x4) ConcatPermute(y Uint32x4, indices Uint32x4) Uint32x4 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2PS, CPU Feature: AVX512 +func (x Float32x8) ConcatPermute(y Float32x8, indices Uint32x8) Float32x8 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2D, CPU Feature: AVX512 +func (x Int32x8) ConcatPermute(y Int32x8, indices Uint32x8) Int32x8 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2D, CPU Feature: AVX512 +func (x Uint32x8) ConcatPermute(y Uint32x8, indices Uint32x8) Uint32x8 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2PS, CPU Feature: AVX512 +func (x Float32x16) ConcatPermute(y Float32x16, indices Uint32x16) Float32x16 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2D, CPU Feature: AVX512 +func (x Int32x16) ConcatPermute(y Int32x16, indices Uint32x16) Int32x16 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2D, CPU Feature: AVX512 +func (x Uint32x16) ConcatPermute(y Uint32x16, indices Uint32x16) Uint32x16 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. 
+// +// Asm: VPERMI2PD, CPU Feature: AVX512 +func (x Float64x2) ConcatPermute(y Float64x2, indices Uint64x2) Float64x2 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2Q, CPU Feature: AVX512 +func (x Int64x2) ConcatPermute(y Int64x2, indices Uint64x2) Int64x2 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2Q, CPU Feature: AVX512 +func (x Uint64x2) ConcatPermute(y Uint64x2, indices Uint64x2) Uint64x2 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2PD, CPU Feature: AVX512 +func (x Float64x4) ConcatPermute(y Float64x4, indices Uint64x4) Float64x4 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2Q, CPU Feature: AVX512 +func (x Int64x4) ConcatPermute(y Int64x4, indices Uint64x4) Int64x4 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2Q, CPU Feature: AVX512 +func (x Uint64x4) ConcatPermute(y Uint64x4, indices Uint64x4) Uint64x4 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2PD, CPU Feature: AVX512 +func (x Float64x8) ConcatPermute(y Float64x8, indices Uint64x8) Float64x8 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2Q, CPU Feature: AVX512 +func (x Int64x8) ConcatPermute(y Int64x8, indices Uint64x8) Int64x8 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2Q, CPU Feature: AVX512 +func (x Uint64x8) ConcatPermute(y Uint64x8, indices Uint64x8) Uint64x8 + /* ConcatShiftBytesRight */ // ConcatShiftBytesRight concatenates x and y and shift it right by constant bytes. 
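The ConcatPermute stubs above are exercised by TestConcatPermute in this patch's simd_test.go changes; a minimal usage sketch in the same spirit (it assumes an AVX-512 machine, matching the test's X86.AVX512() guard): indices below the lane count select from x, the lower half of the concatenation, and indices at or above it select from y.

x := simd.LoadInt64x8Slice([]int64{1, 2, 3, 4, 5, 6, 7, 8})
y := simd.LoadInt64x8Slice([]int64{-1, -2, -3, -4, -5, -6, -7, -8})
// The concatenation xy is {1..8, -1..-8}: index 0 selects x's first
// element, index 15 selects y's last.
idx := simd.LoadUint64x8Slice([]uint64{15, 6, 13, 4, 11, 2, 9, 0})
got := make([]int64, 8)
x.ConcatPermute(y, idx).StoreSlice(got)
// got is now {-8, 7, -6, 5, -4, 3, -2, 1}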
@@ -4551,675 +4793,227 @@ func (x Uint64x8) Or(y Uint64x8) Uint64x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. -// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. +// The low 4 bits (values 0-15) of each element of indices is used // -// Asm: VPSHUFB, CPU Feature: AVX -func (x Int8x16) Permute(indices Int8x16) Int8x16 +// Asm: VPERMB, CPU Feature: AVX512VBMI +func (x Int8x16) Permute(indices Uint8x16) Int8x16 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. -// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. +// The low 4 bits (values 0-15) of each element of indices is used // -// Asm: VPSHUFB, CPU Feature: AVX +// Asm: VPERMB, CPU Feature: AVX512VBMI func (x Uint8x16) Permute(indices Uint8x16) Uint8x16 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 5 bits (values 0-31) of each element of indices is used // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Int8x32) Permute(indices Uint8x32) Int8x32 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 5 bits (values 0-31) of each element of indices is used // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Uint8x32) Permute(indices Uint8x32) Uint8x32 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 6 bits (values 0-63) of each element of indices is used // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Int8x64) Permute(indices Uint8x64) Int8x64 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 6 bits (values 0-63) of each element of indices is used // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Uint8x64) Permute(indices Uint8x64) Uint8x64 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 3 bits (values 0-7) of each element of indices is used // // Asm: VPERMW, CPU Feature: AVX512 func (x Int16x8) Permute(indices Uint16x8) Int16x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. 
+// The low 3 bits (values 0-7) of each element of indices is used // // Asm: VPERMW, CPU Feature: AVX512 func (x Uint16x8) Permute(indices Uint16x8) Uint16x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 4 bits (values 0-15) of each element of indices is used // // Asm: VPERMW, CPU Feature: AVX512 func (x Int16x16) Permute(indices Uint16x16) Int16x16 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 4 bits (values 0-15) of each element of indices is used // // Asm: VPERMW, CPU Feature: AVX512 func (x Uint16x16) Permute(indices Uint16x16) Uint16x16 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 5 bits (values 0-31) of each element of indices is used // // Asm: VPERMW, CPU Feature: AVX512 func (x Int16x32) Permute(indices Uint16x32) Int16x32 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 5 bits (values 0-31) of each element of indices is used // // Asm: VPERMW, CPU Feature: AVX512 func (x Uint16x32) Permute(indices Uint16x32) Uint16x32 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 3 bits (values 0-7) of each element of indices is used // // Asm: VPERMPS, CPU Feature: AVX2 func (x Float32x8) Permute(indices Uint32x8) Float32x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 3 bits (values 0-7) of each element of indices is used // // Asm: VPERMD, CPU Feature: AVX2 func (x Int32x8) Permute(indices Uint32x8) Int32x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 3 bits (values 0-7) of each element of indices is used // // Asm: VPERMD, CPU Feature: AVX2 func (x Uint32x8) Permute(indices Uint32x8) Uint32x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 4 bits (values 0-15) of each element of indices is used // // Asm: VPERMPS, CPU Feature: AVX512 func (x Float32x16) Permute(indices Uint32x16) Float32x16 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. 
+// The low 4 bits (values 0-15) of each element of indices is used // // Asm: VPERMD, CPU Feature: AVX512 func (x Int32x16) Permute(indices Uint32x16) Int32x16 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 4 bits (values 0-15) of each element of indices is used // // Asm: VPERMD, CPU Feature: AVX512 func (x Uint32x16) Permute(indices Uint32x16) Uint32x16 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 2 bits (values 0-3) of each element of indices is used // // Asm: VPERMPD, CPU Feature: AVX512 func (x Float64x4) Permute(indices Uint64x4) Float64x4 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 2 bits (values 0-3) of each element of indices is used // // Asm: VPERMQ, CPU Feature: AVX512 func (x Int64x4) Permute(indices Uint64x4) Int64x4 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 2 bits (values 0-3) of each element of indices is used // // Asm: VPERMQ, CPU Feature: AVX512 func (x Uint64x4) Permute(indices Uint64x4) Uint64x4 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 3 bits (values 0-7) of each element of indices is used // // Asm: VPERMPD, CPU Feature: AVX512 func (x Float64x8) Permute(indices Uint64x8) Float64x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 3 bits (values 0-7) of each element of indices is used // // Asm: VPERMQ, CPU Feature: AVX512 func (x Int64x8) Permute(indices Uint64x8) Int64x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 3 bits (values 0-7) of each element of indices is used // // Asm: VPERMQ, CPU Feature: AVX512 func (x Uint64x8) Permute(indices Uint64x8) Uint64x8 -/* Permute2 */ - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2B, CPU Feature: AVX512VBMI -func (x Int8x16) Permute2(y Int8x16, indices Uint8x16) Int8x16 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. 
-// -// Asm: VPERMI2B, CPU Feature: AVX512VBMI -func (x Uint8x16) Permute2(y Uint8x16, indices Uint8x16) Uint8x16 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2B, CPU Feature: AVX512VBMI -func (x Int8x32) Permute2(y Int8x32, indices Uint8x32) Int8x32 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2B, CPU Feature: AVX512VBMI -func (x Uint8x32) Permute2(y Uint8x32, indices Uint8x32) Uint8x32 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2B, CPU Feature: AVX512VBMI -func (x Int8x64) Permute2(y Int8x64, indices Uint8x64) Int8x64 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2B, CPU Feature: AVX512VBMI -func (x Uint8x64) Permute2(y Uint8x64, indices Uint8x64) Uint8x64 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2W, CPU Feature: AVX512 -func (x Int16x8) Permute2(y Int16x8, indices Uint16x8) Int16x8 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2W, CPU Feature: AVX512 -func (x Uint16x8) Permute2(y Uint16x8, indices Uint16x8) Uint16x8 +/* PermuteOrZero */ -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2W, CPU Feature: AVX512 -func (x Int16x16) Permute2(y Int16x16, indices Uint16x16) Int16x16 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2W, CPU Feature: AVX512 -func (x Uint16x16) Permute2(y Uint16x16, indices Uint16x16) Uint16x16 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. 
-// -// Asm: VPERMI2W, CPU Feature: AVX512 -func (x Int16x32) Permute2(y Int16x32, indices Uint16x32) Int16x32 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2W, CPU Feature: AVX512 -func (x Uint16x32) Permute2(y Uint16x32, indices Uint16x32) Uint16x32 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2PS, CPU Feature: AVX512 -func (x Float32x4) Permute2(y Float32x4, indices Uint32x4) Float32x4 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2D, CPU Feature: AVX512 -func (x Int32x4) Permute2(y Int32x4, indices Uint32x4) Int32x4 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2D, CPU Feature: AVX512 -func (x Uint32x4) Permute2(y Uint32x4, indices Uint32x4) Uint32x4 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2PS, CPU Feature: AVX512 -func (x Float32x8) Permute2(y Float32x8, indices Uint32x8) Float32x8 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2D, CPU Feature: AVX512 -func (x Int32x8) Permute2(y Int32x8, indices Uint32x8) Int32x8 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2D, CPU Feature: AVX512 -func (x Uint32x8) Permute2(y Uint32x8, indices Uint32x8) Uint32x8 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2PS, CPU Feature: AVX512 -func (x Float32x16) Permute2(y Float32x16, indices Uint32x16) Float32x16 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. 
-// -// Asm: VPERMI2D, CPU Feature: AVX512 -func (x Int32x16) Permute2(y Int32x16, indices Uint32x16) Int32x16 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2D, CPU Feature: AVX512 -func (x Uint32x16) Permute2(y Uint32x16, indices Uint32x16) Uint32x16 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2PD, CPU Feature: AVX512 -func (x Float64x2) Permute2(y Float64x2, indices Uint64x2) Float64x2 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2Q, CPU Feature: AVX512 -func (x Int64x2) Permute2(y Int64x2, indices Uint64x2) Int64x2 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2Q, CPU Feature: AVX512 -func (x Uint64x2) Permute2(y Uint64x2, indices Uint64x2) Uint64x2 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2PD, CPU Feature: AVX512 -func (x Float64x4) Permute2(y Float64x4, indices Uint64x4) Float64x4 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2Q, CPU Feature: AVX512 -func (x Int64x4) Permute2(y Int64x4, indices Uint64x4) Int64x4 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2Q, CPU Feature: AVX512 -func (x Uint64x4) Permute2(y Uint64x4, indices Uint64x4) Uint64x4 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2PD, CPU Feature: AVX512 -func (x Float64x8) Permute2(y Float64x8, indices Uint64x8) Float64x8 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2Q, CPU Feature: AVX512 -func (x Int64x8) Permute2(y Int64x8, indices Uint64x8) Int64x8 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. 
-// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2Q, CPU Feature: AVX512 -func (x Uint64x8) Permute2(y Uint64x8, indices Uint64x8) Uint64x8 - -/* PermuteConstant */ - -// PermuteConstant performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFD, CPU Feature: AVX -func (x Int32x4) PermuteConstant(indices uint8) Int32x4 - -// PermuteConstant performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFD, CPU Feature: AVX -func (x Uint32x4) PermuteConstant(indices uint8) Uint32x4 - -/* PermuteConstantGrouped */ - -// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFD, CPU Feature: AVX2 -func (x Int32x8) PermuteConstantGrouped(indices uint8) Int32x8 - -// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFD, CPU Feature: AVX512 -func (x Int32x16) PermuteConstantGrouped(indices uint8) Int32x16 - -// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFD, CPU Feature: AVX2 -func (x Uint32x8) PermuteConstantGrouped(indices uint8) Uint32x8 - -// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. 
-// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFD, CPU Feature: AVX512 -func (x Uint32x16) PermuteConstantGrouped(indices uint8) Uint32x16 - -/* PermuteConstantHi */ - -// PermuteConstantHi performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX512 -func (x Int16x8) PermuteConstantHi(indices uint8) Int16x8 - -// PermuteConstantHi performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX -func (x Int32x4) PermuteConstantHi(indices uint8) Int32x4 - -// PermuteConstantHi performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX512 -func (x Uint16x8) PermuteConstantHi(indices uint8) Uint16x8 - -// PermuteConstantHi performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX -func (x Uint32x4) PermuteConstantHi(indices uint8) Uint32x4 - -/* PermuteConstantHiGrouped */ - -// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX2 -func (x Int16x16) PermuteConstantHiGrouped(indices uint8) Int16x16 - -// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
-// -// Asm: VPSHUFHW, CPU Feature: AVX512 -func (x Int16x32) PermuteConstantHiGrouped(indices uint8) Int16x32 - -// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX2 -func (x Uint16x16) PermuteConstantHiGrouped(indices uint8) Uint16x16 - -// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX512 -func (x Uint16x32) PermuteConstantHiGrouped(indices uint8) Uint16x32 - -/* PermuteConstantLo */ - -// PermuteConstantLo performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX512 -func (x Int16x8) PermuteConstantLo(indices uint8) Int16x8 - -// PermuteConstantLo performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX -func (x Int32x4) PermuteConstantLo(indices uint8) Int32x4 - -// PermuteConstantLo performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX512 -func (x Uint16x8) PermuteConstantLo(indices uint8) Uint16x8 - -// PermuteConstantLo performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
-// -// Asm: VPSHUFHW, CPU Feature: AVX -func (x Uint32x4) PermuteConstantLo(indices uint8) Uint32x4 - -/* PermuteConstantLoGrouped */ - -// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX2 -func (x Int16x16) PermuteConstantLoGrouped(indices uint8) Int16x16 - -// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX512 -func (x Int16x32) PermuteConstantLoGrouped(indices uint8) Int16x32 - -// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// PermuteOrZero performs a full permutation of vector x using indices: +// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// The lower four bits of each byte-sized index in indices select an element from x, +// unless the index's sign bit is set in which case zero is used instead. // -// Asm: VPSHUFHW, CPU Feature: AVX2 -func (x Uint16x16) PermuteConstantLoGrouped(indices uint8) Uint16x16 +// Asm: VPSHUFB, CPU Feature: AVX +func (x Int8x16) PermuteOrZero(indices Int8x16) Int8x16 -// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// PermuteOrZero performs a full permutation of vector x using indices: +// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// The lower four bits of each byte-sized index in indices select an element from x, +// unless the index's sign bit is set in which case zero is used instead. 
// -// Asm: VPSHUFHW, CPU Feature: AVX512 -func (x Uint16x32) PermuteConstantLoGrouped(indices uint8) Uint16x32 +// Asm: VPSHUFB, CPU Feature: AVX +func (x Uint8x16) PermuteOrZero(indices Int8x16) Uint8x16 -/* PermuteGrouped */ +/* PermuteOrZeroGrouped */ -// PermuteGrouped performs a grouped permutation of vector x using indices: -// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} -// Only the needed bits to represent the index of a group of x are used in indices' elements. -// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. +// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices: +// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x, +// unless the index's sign bit is set in which case zero is used instead. // Each group is of size 128-bit. // // Asm: VPSHUFB, CPU Feature: AVX2 -func (x Int8x32) PermuteGrouped(indices Int8x32) Int8x32 +func (x Int8x32) PermuteOrZeroGrouped(indices Int8x32) Int8x32 -// PermuteGrouped performs a grouped permutation of vector x using indices: -// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} -// Only the needed bits to represent the index of a group of x are used in indices' elements. -// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. +// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices: +// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x, +// unless the index's sign bit is set in which case zero is used instead. // Each group is of size 128-bit. // // Asm: VPSHUFB, CPU Feature: AVX512 -func (x Int8x64) PermuteGrouped(indices Int8x64) Int8x64 +func (x Int8x64) PermuteOrZeroGrouped(indices Int8x64) Int8x64 -// PermuteGrouped performs a grouped permutation of vector x using indices: -// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} -// Only the needed bits to represent the index of a group of x are used in indices' elements. -// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. +// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices: +// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x, +// unless the index's sign bit is set in which case zero is used instead. // Each group is of size 128-bit. // // Asm: VPSHUFB, CPU Feature: AVX2 -func (x Uint8x32) PermuteGrouped(indices Uint8x32) Uint8x32 +func (x Uint8x32) PermuteOrZeroGrouped(indices Int8x32) Uint8x32 -// PermuteGrouped performs a grouped permutation of vector x using indices: -// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} -// Only the needed bits to represent the index of a group of x are used in indices' elements. 
-// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. +// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices: +// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x, +// unless the index's sign bit is set in which case zero is used instead. // Each group is of size 128-bit. // // Asm: VPSHUFB, CPU Feature: AVX512 -func (x Uint8x64) PermuteGrouped(indices Uint8x64) Uint8x64 +func (x Uint8x64) PermuteOrZeroGrouped(indices Int8x64) Uint8x64 /* Reciprocal */ @@ -5807,8 +5601,10 @@ func (x Float64x8) Scale(y Float64x8) Float64x8 /* Select128FromPair */ -// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves -// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// Select128FromPair treats the 256-bit vectors x and y as a single vector of four +// 128-bit elements, and returns a 256-bit result formed by +// concatenating the two elements specified by lo and hi. +// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. @@ -5816,8 +5612,10 @@ func (x Float64x8) Scale(y Float64x8) Float64x8 // Asm: VPERM2F128, CPU Feature: AVX func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8 -// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves -// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// Select128FromPair treats the 256-bit vectors x and y as a single vector of four +// 128-bit elements, and returns a 256-bit result formed by +// concatenating the two elements specified by lo and hi. +// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. @@ -5825,8 +5623,10 @@ func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8 // Asm: VPERM2F128, CPU Feature: AVX func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4 -// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves -// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// Select128FromPair treats the 256-bit vectors x and y as a single vector of four +// 128-bit elements, and returns a 256-bit result formed by +// concatenating the two elements specified by lo and hi. +// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. @@ -5834,8 +5634,10 @@ func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4 // Asm: VPERM2I128, CPU Feature: AVX2 func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8 -// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves -// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. 
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four +// 128-bit elements, and returns a 256-bit result formed by +// concatenating the two elements specified by lo and hi. +// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. @@ -5843,8 +5645,10 @@ func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8 // Asm: VPERM2I128, CPU Feature: AVX2 func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4 -// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves -// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// Select128FromPair treats the 256-bit vectors x and y as a single vector of four +// 128-bit elements, and returns a 256-bit result formed by +// concatenating the two elements specified by lo and hi. +// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. @@ -5852,8 +5656,10 @@ func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4 // Asm: VPERM2I128, CPU Feature: AVX2 func (x Uint32x8) Select128FromPair(lo, hi uint8, y Uint32x8) Uint32x8 -// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves -// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// Select128FromPair treats the 256-bit vectors x and y as a single vector of four +// 128-bit elements, and returns a 256-bit result formed by +// concatenating the two elements specified by lo and hi. +// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. diff --git a/src/simd/ops_internal_amd64.go b/src/simd/ops_internal_amd64.go index 8be40995f0..63ee6416a6 100644 --- a/src/simd/ops_internal_amd64.go +++ b/src/simd/ops_internal_amd64.go @@ -338,6 +338,220 @@ func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x // Asm: VSHUFPD, CPU Feature: AVX512 func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8 +/* permuteScalars */ + +// permuteScalars performs a permutation of vector x using constant indices: +// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX +func (x Int32x4) permuteScalars(indices uint8) Int32x4 + +// permuteScalars performs a permutation of vector x using constant indices: +// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPSHUFD, CPU Feature: AVX +func (x Uint32x4) permuteScalars(indices uint8) Uint32x4 + +/* permuteScalarsGrouped */ + +// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX2 +func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8 + +// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX512 +func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16 + +// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX2 +func (x Uint32x8) permuteScalarsGrouped(indices uint8) Uint32x8 + +// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX512 +func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16 + +/* permuteScalarsHi */ + +// permuteScalarsHi performs a permutation of vector x using constant indices: +// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Int16x8) permuteScalarsHi(indices uint8) Int16x8 + +// permuteScalarsHi performs a permutation of vector x using constant indices: +// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Uint16x8) permuteScalarsHi(indices uint8) Uint16x8 + +/* permuteScalarsHiGrouped */ + +// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices: +// result = +// +// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], +// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} +// +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX2 +func (x Int16x16) permuteScalarsHiGrouped(indices uint8) Int16x16 + +// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices: +// result = +// +// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], +// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} +// +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Int16x32) permuteScalarsHiGrouped(indices uint8) Int16x32 + +// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices: +// result = +// +// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], +// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} +// +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX2 +func (x Uint16x16) permuteScalarsHiGrouped(indices uint8) Uint16x16 + +// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices: +// result = +// +// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], +// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} +// +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32 + +/* permuteScalarsLo */ + +// permuteScalarsLo performs a permutation of vector x using constant indices: +// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPSHUFLW, CPU Feature: AVX512 +func (x Int16x8) permuteScalarsLo(indices uint8) Int16x8 + +// permuteScalarsLo performs a permutation of vector x using constant indices: +// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFLW, CPU Feature: AVX512 +func (x Uint16x8) permuteScalarsLo(indices uint8) Uint16x8 + +/* permuteScalarsLoGrouped */ + +// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices: +// +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], +// x_group1[indices[0:2]], ...} +// +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFLW, CPU Feature: AVX2 +func (x Int16x16) permuteScalarsLoGrouped(indices uint8) Int16x16 + +// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices: +// +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], +// x_group1[indices[0:2]], ...} +// +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFLW, CPU Feature: AVX512 +func (x Int16x32) permuteScalarsLoGrouped(indices uint8) Int16x32 + +// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices: +// +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], +// x_group1[indices[0:2]], ...} +// +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFLW, CPU Feature: AVX2 +func (x Uint16x16) permuteScalarsLoGrouped(indices uint8) Uint16x16 + +// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices: +// +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], +// x_group1[indices[0:2]], ...} +// +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFLW, CPU Feature: AVX512 +func (x Uint16x32) permuteScalarsLoGrouped(indices uint8) Uint16x32 + /* tern */ // tern performs a logical operation on three vectors based on the 8-bit truth table. 
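For reference, the packed-immediate encoding consumed by the package-private permuteScalars methods above (and produced by the exported four-parameter wrappers in shuffles_amd64.go below) can be modeled in plain Go. This is an illustrative sketch only, not part of the CL; packImm and permuteScalarsRef are made-up names.

// Scalar sketch (not part of the patch): models the packed-immediate encoding
// used by permuteScalars and produced by the exported PermuteScalars wrappers.
package main

import "fmt"

// packImm packs four 2-bit indices a, b, c, d into one byte, matching the
// expression a&3 | (b&3)<<2 | (c&3)<<4 | d<<6 used by the exported wrappers.
func packImm(a, b, c, d uint8) uint8 {
	return a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
}

// permuteScalarsRef is a scalar model of the VPSHUFD-style permutation:
// result[i] = x[indices[2i:2i+2]].
func permuteScalarsRef(x [4]int32, imm uint8) [4]int32 {
	var r [4]int32
	for i := 0; i < 4; i++ {
		r[i] = x[(imm>>(2*uint(i)))&3]
	}
	return r
}

func main() {
	x := [4]int32{10, 20, 30, 40}
	imm := packImm(3, 2, 1, 0)             // selects x[3], x[2], x[1], x[0]
	fmt.Println(permuteScalarsRef(x, imm)) // prints [40 30 20 10]
}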
diff --git a/src/simd/shuffles_amd64.go b/src/simd/shuffles_amd64.go index e0d9db9266..b7472f7020 100644 --- a/src/simd/shuffles_amd64.go +++ b/src/simd/shuffles_amd64.go @@ -989,3 +989,280 @@ func (x Int64x8) SelectFromPairGrouped(a, b uint8, y Int64x8) Int64x8 { } panic("missing case, switch should be exhaustive") } + +/* PermuteScalars */ + +// PermuteScalars performs a permutation of vector x's elements using the supplied indices: +// +// result = {x[a], x[b], x[c], x[d]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table may be generated. +// +// Asm: VPSHUFD, CPU Feature: AVX +func (x Int32x4) PermuteScalars(a, b, c, d uint8) Int32x4 { + return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalars performs a permutation of vector x's elements using the supplied indices: +// +// result = {x[a], x[b], x[c], x[d]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table may be generated. +// +// Asm: VPSHUFD, CPU Feature: AVX +func (x Uint32x4) PermuteScalars(a, b, c, d uint8) Uint32x4 { + return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +/* PermuteScalarsGrouped */ + +// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table may be generated. +// +// Asm: VPSHUFD, CPU Feature: AVX2 +func (x Int32x8) PermuteScalarsGrouped(a, b, c, d uint8) Int32x8 { + return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// { x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4], +// x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table may be generated. +// +// Asm: VPSHUFD, CPU Feature: AVX512 +func (x Int32x16) PermuteScalarsGrouped(a, b, c, d uint8) Int32x16 { + return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFD, CPU Feature: AVX2 +func (x Uint32x8) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x8 { + return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// { x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4], +// x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. 
+// +// Asm: VPSHUFD, CPU Feature: AVX512 +func (x Uint32x16) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x16 { + return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +/* PermuteScalarsHi */ + +// PermuteScalarsHi performs a permutation of vector x using the supplied indices: +// +// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Int16x8) PermuteScalarsHi(a, b, c, d uint8) Int16x8 { + return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsHi performs a permutation of vector x using the supplied indices: +// +// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Uint16x8) PermuteScalarsHi(a, b, c, d uint8) Uint16x8 { + return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +/* PermuteScalarsHiGrouped */ + +// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4], +// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFHW, CPU Feature: AVX2 +func (x Int16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x16 { + return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4], +// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12], +// x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20], +// x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Int16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x32 { + return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4], +// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]} +// +// Each group is of size 128-bit. +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. 
+// +// Asm: VPSHUFHW, CPU Feature: AVX2 +func (x Uint16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x16 { + return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// { x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4], +// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12], +// x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20], +// x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Uint16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x32 { + return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +/* PermuteScalarsLo */ + +// PermuteScalarsLo performs a permutation of vector x using the supplied indices: +// +// result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFLW, CPU Feature: AVX512 +func (x Int16x8) PermuteScalarsLo(a, b, c, d uint8) Int16x8 { + return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsLo performs a permutation of vector x using the supplied indices: +// +// result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFLW, CPU Feature: AVX512 +func (x Uint16x8) PermuteScalarsLo(a, b, c, d uint8) Uint16x8 { + return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +/* PermuteScalarsLoGrouped */ + +// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7], +// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFLW, CPU Feature: AVX2 +func (x Int16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x16 { + return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7], +// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15], +// x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23], +// x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. 
+// +// Asm: VPSHUFLW, CPU Feature: AVX512 +func (x Int16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x32 { + return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7], +// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFLW, CPU Feature: AVX2 +func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 { + return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7], +// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15], +// x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23], +// x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]} +// +// Each group is of size 128-bit. +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFLW, CPU Feature: AVX512 +func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 { + return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} -- 2.52.0
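For reference, the PermuteScalarsLoGrouped behavior documented above can be modeled in plain Go: within each 128-bit group, lanes 0-3 are chosen by a, b, c, d and lanes 4-7 pass through unchanged. This sketch is illustrative only, not part of the CL; permuteScalarsLoGroupedRef is a made-up name, not a package API.

// Scalar sketch (not part of the patch): models PermuteScalarsLoGrouped for a
// 16-element vector, i.e. two 128-bit groups of eight int16 lanes.
package main

import "fmt"

func permuteScalarsLoGroupedRef(x [16]int16, a, b, c, d uint8) [16]int16 {
	idx := [4]uint8{a & 3, b & 3, c & 3, d & 3}
	var r [16]int16
	for g := 0; g < 16; g += 8 { // one iteration per 128-bit group
		for i := 0; i < 4; i++ {
			r[g+i] = x[g+int(idx[i])] // low four lanes are permuted within the group
		}
		for i := 4; i < 8; i++ {
			r[g+i] = x[g+i] // high four lanes pass through
		}
	}
	return r
}

func main() {
	var x [16]int16
	for i := range x {
		x[i] = int16(i)
	}
	fmt.Println(permuteScalarsLoGroupedRef(x, 3, 2, 1, 0))
	// prints [3 2 1 0 4 5 6 7 11 10 9 8 12 13 14 15]
}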