From: Junyang Shao <shaojunyang@google.com>
Date: Mon, 14 Jul 2025 19:39:44 +0000 (+0000)
Subject: [dev.simd] cmd/compile, simd: add variable Permute
X-Git-Tag: go1.26rc1~147^2~175
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=01f7f57025;p=gostls13.git

[dev.simd] cmd/compile, simd: add variable Permute

This CL also adds tests for the new Permute operations.

This CL was generated by CL 687919.

Change-Id: I9ddd2cd23bb98ecca91bfbeaffd62faa4bd85e0d
Reviewed-on: https://go-review.googlesource.com/c/go/+/687939
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
---

diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index 0ebb955acc..1a7e3be9e5 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -233,6 +233,20 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPHSUBW256,
 		ssa.OpAMD64VPHSUBD128,
 		ssa.OpAMD64VPHSUBD256,
+		ssa.OpAMD64VPERMB128,
+		ssa.OpAMD64VPERMB256,
+		ssa.OpAMD64VPERMB512,
+		ssa.OpAMD64VPERMW128,
+		ssa.OpAMD64VPERMW256,
+		ssa.OpAMD64VPERMW512,
+		ssa.OpAMD64VPERMPS256,
+		ssa.OpAMD64VPERMD256,
+		ssa.OpAMD64VPERMPS512,
+		ssa.OpAMD64VPERMD512,
+		ssa.OpAMD64VPERMPD256,
+		ssa.OpAMD64VPERMQ256,
+		ssa.OpAMD64VPERMPD512,
+		ssa.OpAMD64VPERMQ512,
 		ssa.OpAMD64VPROLVD128,
 		ssa.OpAMD64VPROLVD256,
 		ssa.OpAMD64VPROLVD512,
@@ -468,6 +482,20 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMADDWDMasked128,
 		ssa.OpAMD64VPMADDWDMasked256,
 		ssa.OpAMD64VPMADDWDMasked512,
+		ssa.OpAMD64VPERMBMasked128,
+		ssa.OpAMD64VPERMBMasked256,
+		ssa.OpAMD64VPERMBMasked512,
+		ssa.OpAMD64VPERMWMasked128,
+		ssa.OpAMD64VPERMWMasked256,
+		ssa.OpAMD64VPERMWMasked512,
+		ssa.OpAMD64VPERMPSMasked256,
+		ssa.OpAMD64VPERMDMasked256,
+		ssa.OpAMD64VPERMPSMasked512,
+		ssa.OpAMD64VPERMDMasked512,
+		ssa.OpAMD64VPERMPDMasked256,
+		ssa.OpAMD64VPERMQMasked256,
+		ssa.OpAMD64VPERMPDMasked512,
+		ssa.OpAMD64VPERMQMasked512,
 		ssa.OpAMD64VPROLVDMasked128,
 		ssa.OpAMD64VPROLVDMasked256,
 		ssa.OpAMD64VPROLVDMasked512,
@@ -766,6 +794,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPDPWSSD128,
 		ssa.OpAMD64VPDPWSSD256,
 		ssa.OpAMD64VPDPWSSD512,
+		ssa.OpAMD64VPERMI2B128,
+		ssa.OpAMD64VPERMI2B256,
+		ssa.OpAMD64VPERMI2B512,
+		ssa.OpAMD64VPERMI2W128,
+		ssa.OpAMD64VPERMI2W256,
+		ssa.OpAMD64VPERMI2W512,
+		ssa.OpAMD64VPERMI2PS128,
+		ssa.OpAMD64VPERMI2D128,
+		ssa.OpAMD64VPERMI2PS256,
+		ssa.OpAMD64VPERMI2D256,
+		ssa.OpAMD64VPERMI2PS512,
+		ssa.OpAMD64VPERMI2D512,
+		ssa.OpAMD64VPERMI2PD128,
+		ssa.OpAMD64VPERMI2Q128,
+		ssa.OpAMD64VPERMI2PD256,
+		ssa.OpAMD64VPERMI2Q256,
+		ssa.OpAMD64VPERMI2PD512,
+		ssa.OpAMD64VPERMI2Q512,
 		ssa.OpAMD64VPDPWSSDS128,
 		ssa.OpAMD64VPDPWSSDS256,
 		ssa.OpAMD64VPDPWSSDS512,
@@ -816,6 +862,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPDPWSSDMasked128,
 		ssa.OpAMD64VPDPWSSDMasked256,
 		ssa.OpAMD64VPDPWSSDMasked512,
+		ssa.OpAMD64VPERMI2BMasked128,
+		ssa.OpAMD64VPERMI2BMasked256,
+		ssa.OpAMD64VPERMI2BMasked512,
+		ssa.OpAMD64VPERMI2WMasked128,
+		ssa.OpAMD64VPERMI2WMasked256,
+		ssa.OpAMD64VPERMI2WMasked512,
+		ssa.OpAMD64VPERMI2PSMasked128,
+		ssa.OpAMD64VPERMI2DMasked128,
+		ssa.OpAMD64VPERMI2PSMasked256,
+		ssa.OpAMD64VPERMI2DMasked256,
+		ssa.OpAMD64VPERMI2PSMasked512,
+		ssa.OpAMD64VPERMI2DMasked512,
+		ssa.OpAMD64VPERMI2PDMasked128,
+		ssa.OpAMD64VPERMI2QMasked128,
+		ssa.OpAMD64VPERMI2PDMasked256,
+		ssa.OpAMD64VPERMI2QMasked256,
+		ssa.OpAMD64VPERMI2PDMasked512,
+		ssa.OpAMD64VPERMI2QMasked512,
 		ssa.OpAMD64VPDPWSSDSMasked128,
 		ssa.OpAMD64VPDPWSSDSMasked256,
 		ssa.OpAMD64VPDPWSSDSMasked512,
@@ -1158,6 +1222,38 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMADDWDMasked128,
 		ssa.OpAMD64VPMADDWDMasked256,
 		ssa.OpAMD64VPMADDWDMasked512,
+		ssa.OpAMD64VPERMI2BMasked128,
+		ssa.OpAMD64VPERMI2BMasked256,
+		ssa.OpAMD64VPERMI2BMasked512,
+		ssa.OpAMD64VPERMI2WMasked128,
+		ssa.OpAMD64VPERMI2WMasked256,
+		ssa.OpAMD64VPERMI2WMasked512,
+		ssa.OpAMD64VPERMI2PSMasked128,
+		ssa.OpAMD64VPERMI2DMasked128,
+		ssa.OpAMD64VPERMI2PSMasked256,
+		ssa.OpAMD64VPERMI2DMasked256,
+		ssa.OpAMD64VPERMI2PSMasked512,
+		ssa.OpAMD64VPERMI2DMasked512,
+		ssa.OpAMD64VPERMI2PDMasked128,
+		ssa.OpAMD64VPERMI2QMasked128,
+		ssa.OpAMD64VPERMI2PDMasked256,
+		ssa.OpAMD64VPERMI2QMasked256,
+		ssa.OpAMD64VPERMI2PDMasked512,
+		ssa.OpAMD64VPERMI2QMasked512,
+		ssa.OpAMD64VPERMBMasked128,
+		ssa.OpAMD64VPERMBMasked256,
+		ssa.OpAMD64VPERMBMasked512,
+		ssa.OpAMD64VPERMWMasked128,
+		ssa.OpAMD64VPERMWMasked256,
+		ssa.OpAMD64VPERMWMasked512,
+		ssa.OpAMD64VPERMPSMasked256,
+		ssa.OpAMD64VPERMDMasked256,
+		ssa.OpAMD64VPERMPSMasked512,
+		ssa.OpAMD64VPERMDMasked512,
+		ssa.OpAMD64VPERMPDMasked256,
+		ssa.OpAMD64VPERMQMasked256,
+		ssa.OpAMD64VPERMPDMasked512,
+		ssa.OpAMD64VPERMQMasked512,
 		ssa.OpAMD64VPOPCNTBMasked128,
 		ssa.OpAMD64VPOPCNTBMasked256,
 		ssa.OpAMD64VPOPCNTBMasked512,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 0cbca8bf72..5898406e9d 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -985,6 +985,114 @@
 (PairwiseSubUint16x16 ...) => (VPHSUBW256 ...)
 (PairwiseSubUint32x4 ...) => (VPHSUBD128 ...)
 (PairwiseSubUint32x8 ...) => (VPHSUBD256 ...)
+(PermuteFloat32x8 ...) => (VPERMPS256 ...)
+(PermuteFloat32x16 ...) => (VPERMPS512 ...)
+(PermuteFloat64x4 ...) => (VPERMPD256 ...)
+(PermuteFloat64x8 ...) => (VPERMPD512 ...)
+(PermuteInt8x16 ...) => (VPERMB128 ...)
+(PermuteInt8x32 ...) => (VPERMB256 ...)
+(PermuteInt8x64 ...) => (VPERMB512 ...)
+(PermuteInt16x8 ...) => (VPERMW128 ...)
+(PermuteInt16x16 ...) => (VPERMW256 ...)
+(PermuteInt16x32 ...) => (VPERMW512 ...)
+(PermuteInt32x8 ...) => (VPERMD256 ...)
+(PermuteInt32x16 ...) => (VPERMD512 ...)
+(PermuteInt64x4 ...) => (VPERMQ256 ...)
+(PermuteInt64x8 ...) => (VPERMQ512 ...)
+(PermuteUint8x16 ...) => (VPERMB128 ...)
+(PermuteUint8x32 ...) => (VPERMB256 ...)
+(PermuteUint8x64 ...) => (VPERMB512 ...)
+(PermuteUint16x8 ...) => (VPERMW128 ...)
+(PermuteUint16x16 ...) => (VPERMW256 ...)
+(PermuteUint16x32 ...) => (VPERMW512 ...)
+(PermuteUint32x8 ...) => (VPERMD256 ...)
+(PermuteUint32x16 ...) => (VPERMD512 ...)
+(PermuteUint64x4 ...) => (VPERMQ256 ...)
+(PermuteUint64x8 ...) => (VPERMQ512 ...)
+(Permute2Float32x4 ...) => (VPERMI2PS128 ...)
+(Permute2Float32x8 ...) => (VPERMI2PS256 ...)
+(Permute2Float32x16 ...) => (VPERMI2PS512 ...)
+(Permute2Float64x2 ...) => (VPERMI2PD128 ...)
+(Permute2Float64x4 ...) => (VPERMI2PD256 ...)
+(Permute2Float64x8 ...) => (VPERMI2PD512 ...)
+(Permute2Int8x16 ...) => (VPERMI2B128 ...)
+(Permute2Int8x32 ...) => (VPERMI2B256 ...)
+(Permute2Int8x64 ...) => (VPERMI2B512 ...)
+(Permute2Int16x8 ...) => (VPERMI2W128 ...)
+(Permute2Int16x16 ...) => (VPERMI2W256 ...)
+(Permute2Int16x32 ...) => (VPERMI2W512 ...)
+(Permute2Int32x4 ...) => (VPERMI2D128 ...)
+(Permute2Int32x8 ...) => (VPERMI2D256 ...)
+(Permute2Int32x16 ...) => (VPERMI2D512 ...)
+(Permute2Int64x2 ...) => (VPERMI2Q128 ...)
+(Permute2Int64x4 ...) => (VPERMI2Q256 ...)
+(Permute2Int64x8 ...) => (VPERMI2Q512 ...)
+(Permute2Uint8x16 ...) => (VPERMI2B128 ...)
+(Permute2Uint8x32 ...) => (VPERMI2B256 ...)
+(Permute2Uint8x64 ...) => (VPERMI2B512 ...)
+(Permute2Uint16x8 ...) => (VPERMI2W128 ...)
+(Permute2Uint16x16 ...) => (VPERMI2W256 ...)
+(Permute2Uint16x32 ...) => (VPERMI2W512 ...)
+(Permute2Uint32x4 ...) => (VPERMI2D128 ...)
+(Permute2Uint32x8 ...) => (VPERMI2D256 ...)
+(Permute2Uint32x16 ...) => (VPERMI2D512 ...)
+(Permute2Uint64x2 ...) => (VPERMI2Q128 ...)
+(Permute2Uint64x4 ...) => (VPERMI2Q256 ...)
+(Permute2Uint64x8 ...) => (VPERMI2Q512 ...)
+(Permute2MaskedFloat32x4 x y z mask) => (VPERMI2PSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+(Permute2MaskedFloat32x8 x y z mask) => (VPERMI2PSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+(Permute2MaskedFloat32x16 x y z mask) => (VPERMI2PSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+(Permute2MaskedFloat64x2 x y z mask) => (VPERMI2PDMasked128 x y z (VPMOVVec64x2ToM <types.TypeMask> mask))
+(Permute2MaskedFloat64x4 x y z mask) => (VPERMI2PDMasked256 x y z (VPMOVVec64x4ToM <types.TypeMask> mask))
+(Permute2MaskedFloat64x8 x y z mask) => (VPERMI2PDMasked512 x y z (VPMOVVec64x8ToM <types.TypeMask> mask))
+(Permute2MaskedInt8x16 x y z mask) => (VPERMI2BMasked128 x y z (VPMOVVec8x16ToM <types.TypeMask> mask))
+(Permute2MaskedInt8x32 x y z mask) => (VPERMI2BMasked256 x y z (VPMOVVec8x32ToM <types.TypeMask> mask))
+(Permute2MaskedInt8x64 x y z mask) => (VPERMI2BMasked512 x y z (VPMOVVec8x64ToM <types.TypeMask> mask))
+(Permute2MaskedInt16x8 x y z mask) => (VPERMI2WMasked128 x y z (VPMOVVec16x8ToM <types.TypeMask> mask))
+(Permute2MaskedInt16x16 x y z mask) => (VPERMI2WMasked256 x y z (VPMOVVec16x16ToM <types.TypeMask> mask))
+(Permute2MaskedInt16x32 x y z mask) => (VPERMI2WMasked512 x y z (VPMOVVec16x32ToM <types.TypeMask> mask))
+(Permute2MaskedInt32x4 x y z mask) => (VPERMI2DMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+(Permute2MaskedInt32x8 x y z mask) => (VPERMI2DMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+(Permute2MaskedInt32x16 x y z mask) => (VPERMI2DMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+(Permute2MaskedInt64x2 x y z mask) => (VPERMI2QMasked128 x y z (VPMOVVec64x2ToM <types.TypeMask> mask))
+(Permute2MaskedInt64x4 x y z mask) => (VPERMI2QMasked256 x y z (VPMOVVec64x4ToM <types.TypeMask> mask))
+(Permute2MaskedInt64x8 x y z mask) => (VPERMI2QMasked512 x y z (VPMOVVec64x8ToM <types.TypeMask> mask))
+(Permute2MaskedUint8x16 x y z mask) => (VPERMI2BMasked128 x y z (VPMOVVec8x16ToM <types.TypeMask> mask))
+(Permute2MaskedUint8x32 x y z mask) => (VPERMI2BMasked256 x y z (VPMOVVec8x32ToM <types.TypeMask> mask))
+(Permute2MaskedUint8x64 x y z mask) => (VPERMI2BMasked512 x y z (VPMOVVec8x64ToM <types.TypeMask> mask))
+(Permute2MaskedUint16x8 x y z mask) => (VPERMI2WMasked128 x y z (VPMOVVec16x8ToM <types.TypeMask> mask))
+(Permute2MaskedUint16x16 x y z mask) => (VPERMI2WMasked256 x y z (VPMOVVec16x16ToM <types.TypeMask> mask))
+(Permute2MaskedUint16x32 x y z mask) => (VPERMI2WMasked512 x y z (VPMOVVec16x32ToM <types.TypeMask> mask))
+(Permute2MaskedUint32x4 x y z mask) => (VPERMI2DMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+(Permute2MaskedUint32x8 x y z mask) => (VPERMI2DMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+(Permute2MaskedUint32x16 x y z mask) => (VPERMI2DMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+(Permute2MaskedUint64x2 x y z mask) => (VPERMI2QMasked128 x y z (VPMOVVec64x2ToM <types.TypeMask> mask))
+(Permute2MaskedUint64x4 x y z mask) => (VPERMI2QMasked256 x y z (VPMOVVec64x4ToM <types.TypeMask> mask))
+(Permute2MaskedUint64x8 x y z mask) => (VPERMI2QMasked512 x y z (VPMOVVec64x8ToM <types.TypeMask> mask))
+(PermuteMaskedFloat32x8 x y mask) => (VPERMPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+(PermuteMaskedFloat32x16 x y mask) => (VPERMPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+(PermuteMaskedFloat64x4 x y mask) => (VPERMPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+(PermuteMaskedFloat64x8 x y mask) => (VPERMPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(PermuteMaskedInt8x16 x y mask) => (VPERMBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+(PermuteMaskedInt8x32 x y mask) => (VPERMBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+(PermuteMaskedInt8x64 x y mask) => (VPERMBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+(PermuteMaskedInt16x8 x y mask) => (VPERMWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(PermuteMaskedInt16x16 x y mask) => (VPERMWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+(PermuteMaskedInt16x32 x y mask) => (VPERMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+(PermuteMaskedInt32x8 x y mask) => (VPERMDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+(PermuteMaskedInt32x16 x y mask) => (VPERMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+(PermuteMaskedInt64x4 x y mask) => (VPERMQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+(PermuteMaskedInt64x8 x y mask) => (VPERMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(PermuteMaskedUint8x16 x y mask) => (VPERMBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+(PermuteMaskedUint8x32 x y mask) => (VPERMBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+(PermuteMaskedUint8x64 x y mask) => (VPERMBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+(PermuteMaskedUint16x8 x y mask) => (VPERMWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(PermuteMaskedUint16x16 x y mask) => (VPERMWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+(PermuteMaskedUint16x32 x y mask) => (VPERMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+(PermuteMaskedUint32x8 x y mask) => (VPERMDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+(PermuteMaskedUint32x16 x y mask) => (VPERMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+(PermuteMaskedUint64x4 x y mask) => (VPERMQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+(PermuteMaskedUint64x8 x y mask) => (VPERMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
 (PopCountInt8x16 ...) => (VPOPCNTB128 ...)
 (PopCountInt8x32 ...) => (VPOPCNTB256 ...)
 (PopCountInt8x64 ...) => (VPOPCNTB512 ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
index 6985daa04b..19ac0b0dea 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@@ -613,6 +613,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMINUWMasked256", argLength: 3, reg: w2kw, asm: "VPMINUW", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMULHUW256", argLength: 2, reg: v21, asm: "VPMULHUW", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMULHUWMasked256", argLength: 3, reg: w2kw, asm: "VPMULHUW", commutative: true, typ: "Vec256", resultInArg0: false},
+		{name: "VPERMW256", argLength: 2, reg: w21, asm: "VPERMW", commutative: false, typ: "Vec256", resultInArg0: false},
+		{name: "VPERMI2W256", argLength: 3, reg: w31, asm: "VPERMI2W", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VPERMI2WMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2W", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VPERMWMasked256", argLength: 3, reg: w2kw, asm: "VPERMW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLW256", argLength: 2, reg: vfpv, asm: "VPSRLW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLWMasked256", argLength: 3, reg: wfpkw, asm: "VPSRLW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLVW256", argLength: 2, reg: w21, asm: "VPSRLVW", commutative: false, typ: "Vec256", resultInArg0: false},
@@ -625,6 +629,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMINUWMasked512", argLength: 3, reg: w2kw, asm: "VPMINUW", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMULHUW512", argLength: 2, reg: w21, asm: "VPMULHUW", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMULHUWMasked512", argLength: 3, reg: w2kw, asm: "VPMULHUW", commutative: true, typ: "Vec512", resultInArg0: false},
+		{name: "VPERMW512", argLength: 2, reg: w21, asm: "VPERMW", commutative: false, typ: "Vec512", resultInArg0: false},
+		{name: "VPERMI2W512", argLength: 3, reg: w31, asm: "VPERMI2W", commutative: false, typ: "Vec512", resultInArg0: true},
+		{name: "VPERMI2WMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2W", commutative: false, typ: "Vec512", resultInArg0: true},
+		{name: "VPERMWMasked512", argLength: 3, reg: w2kw, asm: "VPERMW", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLW512", argLength: 2, reg: wfpw, asm: "VPSRLW", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLWMasked512", argLength: 3, reg: wfpkw, asm: "VPSRLW", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLVW512", argLength: 2, reg: w21, asm: "VPSRLVW", commutative: false, typ: "Vec512", resultInArg0: false},
@@ -637,6 +645,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMINUWMasked128", argLength: 3, reg: w2kw, asm: "VPMINUW", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMULHUW128", argLength: 2, reg: v21, asm: "VPMULHUW", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMULHUWMasked128", argLength: 3, reg: w2kw, asm: "VPMULHUW", commutative: true, typ: "Vec128", resultInArg0: false},
+		{name: "VPERMW128", argLength: 2, reg: w21, asm: "VPERMW", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VPERMI2W128", argLength: 3, reg: w31, asm: "VPERMI2W", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VPERMI2WMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2W", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VPERMWMasked128", argLength: 3, reg: w2kw, asm: "VPERMW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSRLW128", argLength: 2, reg: vfpv, asm: "VPSRLW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSRLWMasked128", argLength: 3, reg: wfpkw, asm: "VPSRLW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSRLVW128", argLength: 2, reg: w21, asm: "VPSRLVW", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -645,6 +657,14 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMAXUDMasked512", argLength: 3, reg: w2kw, asm: "VPMAXUD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMINUD512", argLength: 2, reg: w21, asm: "VPMINUD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMINUDMasked512", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec512", resultInArg0: false},
+		{name: "VPERMPS512", argLength: 2, reg: w21, asm: "VPERMPS", commutative: false, typ: "Vec512", resultInArg0: false},
+		{name: "VPERMD512", argLength: 2, reg: w21, asm: "VPERMD", commutative: false, typ: "Vec512", resultInArg0: false},
+		{name: "VPERMI2D512", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec512", resultInArg0: true},
+		{name: "VPERMI2PS512", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec512", resultInArg0: true},
+		{name: "VPERMI2DMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec512", resultInArg0: true},
+		{name: "VPERMI2PSMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec512", resultInArg0: true},
+		{name: "VPERMPSMasked512", argLength: 3, reg: w2kw, asm: "VPERMPS", commutative: false, typ: "Vec512", resultInArg0: false},
+		{name: "VPERMDMasked512", argLength: 3, reg: w2kw, asm: "VPERMD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLD512", argLength: 2, reg: wfpw, asm: "VPSRLD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLDMasked512", argLength: 3, reg: wfpkw, asm: "VPSRLD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLVD512", argLength: 2, reg: w21, asm: "VPSRLVD", commutative: false, typ: "Vec512", resultInArg0: false},
@@ -654,6 +674,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMINUD128", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMINUDMasked128", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMULUDQ128", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec128", resultInArg0: false},
+		{name: "VPERMI2D128", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VPERMI2PS128", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VPERMI2PSMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VPERMI2DMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPSRLD128", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSRLDMasked128", argLength: 3, reg: wfpkw, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSRLVD128", argLength: 2, reg: v21, asm: "VPSRLVD", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -663,6 +687,14 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMINUD256", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMINUDMasked256", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMULUDQ256", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false},
+		{name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
+		{name: "VPERMPS256", argLength: 2, reg: v21, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false},
+		{name: "VPERMI2D256", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VPERMI2PS256", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VPERMI2PSMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VPERMI2DMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VPERMPSMasked256", argLength: 3, reg: w2kw, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false},
+		{name: "VPERMDMasked256", argLength: 3, reg: w2kw, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLD256", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLDMasked256", argLength: 3, reg: wfpkw, asm: "VPSRLD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLVD256", argLength: 2, reg: v21, asm: "VPSRLVD", commutative: false, typ: "Vec256", resultInArg0: false},
@@ -672,6 +704,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMINUQ128", argLength: 2, reg: w21, asm: "VPMINUQ", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMINUQMasked128", argLength: 3, reg: w2kw, asm: "VPMINUQ", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMULUDQMasked128", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec128", resultInArg0: false},
+		{name: "VPERMI2PD128", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VPERMI2Q128", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VPERMI2QMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VPERMI2PDMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPSRLQ128", argLength: 2, reg: vfpv, asm: "VPSRLQ", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSRLQMasked128", argLength: 3, reg: wfpkw, asm: "VPSRLQ", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSRLVQ128", argLength: 2, reg: v21, asm: "VPSRLVQ", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -681,6 +717,14 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMINUQ256", argLength: 2, reg: w21, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMINUQMasked256", argLength: 3, reg: w2kw, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMULUDQMasked256", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false},
+		{name: "VPERMQ256", argLength: 2, reg: w21, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false},
+		{name: "VPERMPD256", argLength: 2, reg: w21, asm: "VPERMPD", commutative: false, typ: "Vec256", resultInArg0: false},
+		{name: "VPERMI2PD256", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VPERMI2Q256", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VPERMI2PDMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VPERMI2QMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VPERMPDMasked256", argLength: 3, reg: w2kw, asm: "VPERMPD", commutative: false, typ: "Vec256", resultInArg0: false},
+		{name: "VPERMQMasked256", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLQ256", argLength: 2, reg: vfpv, asm: "VPSRLQ", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLQMasked256", argLength: 3, reg: wfpkw, asm: "VPSRLQ", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLVQ256", argLength: 2, reg: v21, asm: "VPSRLVQ", commutative: false, typ: "Vec256", resultInArg0: false},
@@ -691,6 +735,14 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMINUQMasked512", argLength: 3, reg: w2kw, asm: "VPMINUQ", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMULUDQ512", argLength: 2, reg: w21, asm: "VPMULUDQ", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMULUDQMasked512", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec512", resultInArg0: false},
+		{name: "VPERMPD512", argLength: 2, reg: w21, asm: "VPERMPD", commutative: false, typ: "Vec512", resultInArg0: false},
+		{name: "VPERMQ512", argLength: 2, reg: w21, asm: "VPERMQ", commutative: false, typ: "Vec512", resultInArg0: false},
+		{name: "VPERMI2Q512", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec512", resultInArg0: true},
+		{name: "VPERMI2PD512", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true},
+		{name: "VPERMI2QMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec512", resultInArg0: true},
+		{name: "VPERMI2PDMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true},
+		{name: "VPERMPDMasked512", argLength: 3, reg: w2kw, asm: "VPERMPD", commutative: false, typ: "Vec512", resultInArg0: false},
+		{name: "VPERMQMasked512", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLQ512", argLength: 2, reg: wfpw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLQMasked512", argLength: 3, reg: wfpkw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLVQ512", argLength: 2, reg: w21, asm: "VPSRLVQ", commutative: false, typ: "Vec512", resultInArg0: false},
@@ -703,6 +755,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMAXUBMasked128", argLength: 3, reg: w2kw, asm: "VPMAXUB", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMINUB128", argLength: 2, reg: v21, asm: "VPMINUB", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMINUBMasked128", argLength: 3, reg: w2kw, asm: "VPMINUB", commutative: true, typ: "Vec128", resultInArg0: false},
+		{name: "VPERMB128", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VPERMI2B128", argLength: 3, reg: w31, asm: "VPERMI2B", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VPERMI2BMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2B", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "VPERMBMasked128", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMADDUBSW128", argLength: 2, reg: v21, asm: "VPMADDUBSW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMADDUBSWMasked128", argLength: 3, reg: w2kw, asm: "VPMADDUBSW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPAVGB256", argLength: 2, reg: v21, asm: "VPAVGB", commutative: true, typ: "Vec256", resultInArg0: false},
@@ -713,6 +769,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMAXUBMasked256", argLength: 3, reg: w2kw, asm: "VPMAXUB", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMINUB256", argLength: 2, reg: v21, asm: "VPMINUB", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMINUBMasked256", argLength: 3, reg: w2kw, asm: "VPMINUB", commutative: true, typ: "Vec256", resultInArg0: false},
+		{name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
+		{name: "VPERMI2B256", argLength: 3, reg: w31, asm: "VPERMI2B", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VPERMI2BMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2B", commutative: false, typ: "Vec256", resultInArg0: true},
+		{name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPMADDUBSW256", argLength: 2, reg: v21, asm: "VPMADDUBSW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPMADDUBSWMasked256", argLength: 3, reg: w2kw, asm: "VPMADDUBSW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPAVGB512", argLength: 2, reg: w21, asm: "VPAVGB", commutative: true, typ: "Vec512", resultInArg0: false},
@@ -723,6 +783,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMAXUBMasked512", argLength: 3, reg: w2kw, asm: "VPMAXUB", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMINUB512", argLength: 2, reg: w21, asm: "VPMINUB", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMINUBMasked512", argLength: 3, reg: w2kw, asm: "VPMINUB", commutative: true, typ: "Vec512", resultInArg0: false},
+		{name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
+		{name: "VPERMI2B512", argLength: 3, reg: w31, asm: "VPERMI2B", commutative: false, typ: "Vec512", resultInArg0: true},
+		{name: "VPERMI2BMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2B", commutative: false, typ: "Vec512", resultInArg0: true},
+		{name: "VPERMBMasked512", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPMADDUBSW512", argLength: 2, reg: w21, asm: "VPMADDUBSW", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPMADDUBSWMasked512", argLength: 3, reg: w2kw, asm: "VPMADDUBSW", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VRNDSCALEPS512", argLength: 1, reg: w11, asm: "VRNDSCALEPS", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index a1dfc1e7da..dd27d0cc94 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -889,6 +889,14 @@ func simdGenericOps() []opData {
 		{name: "OrUint16x16", argLength: 2, commutative: true},
 		{name: "PairwiseAddUint16x16", argLength: 2, commutative: false},
 		{name: "PairwiseSubUint16x16", argLength: 2, commutative: false},
+		{name: "PermuteInt16x16", argLength: 2, commutative: false},
+		{name: "PermuteUint16x16", argLength: 2, commutative: false},
+		{name: "Permute2Uint16x16", argLength: 3, commutative: false},
+		{name: "Permute2Int16x16", argLength: 3, commutative: false},
+		{name: "Permute2MaskedUint16x16", argLength: 4, commutative: false},
+		{name: "Permute2MaskedInt16x16", argLength: 4, commutative: false},
+		{name: "PermuteMaskedUint16x16", argLength: 3, commutative: false},
+		{name: "PermuteMaskedInt16x16", argLength: 3, commutative: false},
 		{name: "PopCountUint16x16", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint16x16", argLength: 2, commutative: false},
 		{name: "SaturatedAddUint16x16", argLength: 2, commutative: true},
@@ -932,6 +940,14 @@ func simdGenericOps() []opData {
 		{name: "MulHighMaskedUint16x32", argLength: 3, commutative: true},
 		{name: "NotEqualUint16x32", argLength: 2, commutative: true},
 		{name: "NotEqualMaskedUint16x32", argLength: 3, commutative: true},
+		{name: "PermuteUint16x32", argLength: 2, commutative: false},
+		{name: "PermuteInt16x32", argLength: 2, commutative: false},
+		{name: "Permute2Int16x32", argLength: 3, commutative: false},
+		{name: "Permute2Uint16x32", argLength: 3, commutative: false},
+		{name: "Permute2MaskedUint16x32", argLength: 4, commutative: false},
+		{name: "Permute2MaskedInt16x32", argLength: 4, commutative: false},
+		{name: "PermuteMaskedUint16x32", argLength: 3, commutative: false},
+		{name: "PermuteMaskedInt16x32", argLength: 3, commutative: false},
 		{name: "PopCountUint16x32", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint16x32", argLength: 2, commutative: false},
 		{name: "SaturatedAddUint16x32", argLength: 2, commutative: true},
@@ -979,6 +995,14 @@ func simdGenericOps() []opData {
 		{name: "OrUint16x8", argLength: 2, commutative: true},
 		{name: "PairwiseAddUint16x8", argLength: 2, commutative: false},
 		{name: "PairwiseSubUint16x8", argLength: 2, commutative: false},
+		{name: "PermuteUint16x8", argLength: 2, commutative: false},
+		{name: "PermuteInt16x8", argLength: 2, commutative: false},
+		{name: "Permute2Int16x8", argLength: 3, commutative: false},
+		{name: "Permute2Uint16x8", argLength: 3, commutative: false},
+		{name: "Permute2MaskedUint16x8", argLength: 4, commutative: false},
+		{name: "Permute2MaskedInt16x8", argLength: 4, commutative: false},
+		{name: "PermuteMaskedInt16x8", argLength: 3, commutative: false},
+		{name: "PermuteMaskedUint16x8", argLength: 3, commutative: false},
 		{name: "PopCountUint16x8", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint16x8", argLength: 2, commutative: false},
 		{name: "SaturatedAddUint16x8", argLength: 2, commutative: true},
@@ -1024,6 +1048,18 @@ func simdGenericOps() []opData {
 		{name: "NotEqualMaskedUint32x16", argLength: 3, commutative: true},
 		{name: "OrUint32x16", argLength: 2, commutative: true},
 		{name: "OrMaskedUint32x16", argLength: 3, commutative: true},
+		{name: "PermuteInt32x16", argLength: 2, commutative: false},
+		{name: "PermuteUint32x16", argLength: 2, commutative: false},
+		{name: "PermuteFloat32x16", argLength: 2, commutative: false},
+		{name: "Permute2Int32x16", argLength: 3, commutative: false},
+		{name: "Permute2Uint32x16", argLength: 3, commutative: false},
+		{name: "Permute2Float32x16", argLength: 3, commutative: false},
+		{name: "Permute2MaskedUint32x16", argLength: 4, commutative: false},
+		{name: "Permute2MaskedInt32x16", argLength: 4, commutative: false},
+		{name: "Permute2MaskedFloat32x16", argLength: 4, commutative: false},
+		{name: "PermuteMaskedUint32x16", argLength: 3, commutative: false},
+		{name: "PermuteMaskedInt32x16", argLength: 3, commutative: false},
+		{name: "PermuteMaskedFloat32x16", argLength: 3, commutative: false},
 		{name: "PopCountUint32x16", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint32x16", argLength: 2, commutative: false},
 		{name: "RotateLeftUint32x16", argLength: 2, commutative: false},
@@ -1077,6 +1113,12 @@ func simdGenericOps() []opData {
 		{name: "OrMaskedUint32x4", argLength: 3, commutative: true},
 		{name: "PairwiseAddUint32x4", argLength: 2, commutative: false},
 		{name: "PairwiseSubUint32x4", argLength: 2, commutative: false},
+		{name: "Permute2Uint32x4", argLength: 3, commutative: false},
+		{name: "Permute2Float32x4", argLength: 3, commutative: false},
+		{name: "Permute2Int32x4", argLength: 3, commutative: false},
+		{name: "Permute2MaskedUint32x4", argLength: 4, commutative: false},
+		{name: "Permute2MaskedInt32x4", argLength: 4, commutative: false},
+		{name: "Permute2MaskedFloat32x4", argLength: 4, commutative: false},
 		{name: "PopCountUint32x4", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint32x4", argLength: 2, commutative: false},
 		{name: "RotateLeftUint32x4", argLength: 2, commutative: false},
@@ -1130,6 +1172,18 @@ func simdGenericOps() []opData {
 		{name: "OrMaskedUint32x8", argLength: 3, commutative: true},
 		{name: "PairwiseAddUint32x8", argLength: 2, commutative: false},
 		{name: "PairwiseSubUint32x8", argLength: 2, commutative: false},
+		{name: "PermuteInt32x8", argLength: 2, commutative: false},
+		{name: "PermuteFloat32x8", argLength: 2, commutative: false},
+		{name: "PermuteUint32x8", argLength: 2, commutative: false},
+		{name: "Permute2Uint32x8", argLength: 3, commutative: false},
+		{name: "Permute2Float32x8", argLength: 3, commutative: false},
+		{name: "Permute2Int32x8", argLength: 3, commutative: false},
+		{name: "Permute2MaskedFloat32x8", argLength: 4, commutative: false},
+		{name: "Permute2MaskedUint32x8", argLength: 4, commutative: false},
+		{name: "Permute2MaskedInt32x8", argLength: 4, commutative: false},
+		{name: "PermuteMaskedInt32x8", argLength: 3, commutative: false},
+		{name: "PermuteMaskedFloat32x8", argLength: 3, commutative: false},
+		{name: "PermuteMaskedUint32x8", argLength: 3, commutative: false},
 		{name: "PopCountUint32x8", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint32x8", argLength: 2, commutative: false},
 		{name: "RotateLeftUint32x8", argLength: 2, commutative: false},
@@ -1182,6 +1236,12 @@ func simdGenericOps() []opData {
 		{name: "NotEqualMaskedUint64x2", argLength: 3, commutative: true},
 		{name: "OrUint64x2", argLength: 2, commutative: true},
 		{name: "OrMaskedUint64x2", argLength: 3, commutative: true},
+		{name: "Permute2Uint64x2", argLength: 3, commutative: false},
+		{name: "Permute2Int64x2", argLength: 3, commutative: false},
+		{name: "Permute2Float64x2", argLength: 3, commutative: false},
+		{name: "Permute2MaskedUint64x2", argLength: 4, commutative: false},
+		{name: "Permute2MaskedInt64x2", argLength: 4, commutative: false},
+		{name: "Permute2MaskedFloat64x2", argLength: 4, commutative: false},
 		{name: "PopCountUint64x2", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint64x2", argLength: 2, commutative: false},
 		{name: "RotateLeftUint64x2", argLength: 2, commutative: false},
@@ -1230,6 +1290,18 @@ func simdGenericOps() []opData {
 		{name: "NotEqualMaskedUint64x4", argLength: 3, commutative: true},
 		{name: "OrUint64x4", argLength: 2, commutative: true},
 		{name: "OrMaskedUint64x4", argLength: 3, commutative: true},
+		{name: "PermuteUint64x4", argLength: 2, commutative: false},
+		{name: "PermuteInt64x4", argLength: 2, commutative: false},
+		{name: "PermuteFloat64x4", argLength: 2, commutative: false},
+		{name: "Permute2Uint64x4", argLength: 3, commutative: false},
+		{name: "Permute2Int64x4", argLength: 3, commutative: false},
+		{name: "Permute2Float64x4", argLength: 3, commutative: false},
+		{name: "Permute2MaskedInt64x4", argLength: 4, commutative: false},
+		{name: "Permute2MaskedUint64x4", argLength: 4, commutative: false},
+		{name: "Permute2MaskedFloat64x4", argLength: 4, commutative: false},
+		{name: "PermuteMaskedFloat64x4", argLength: 3, commutative: false},
+		{name: "PermuteMaskedInt64x4", argLength: 3, commutative: false},
+		{name: "PermuteMaskedUint64x4", argLength: 3, commutative: false},
 		{name: "PopCountUint64x4", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint64x4", argLength: 2, commutative: false},
 		{name: "RotateLeftUint64x4", argLength: 2, commutative: false},
@@ -1278,6 +1350,18 @@ func simdGenericOps() []opData {
 		{name: "NotEqualMaskedUint64x8", argLength: 3, commutative: true},
 		{name: "OrUint64x8", argLength: 2, commutative: true},
 		{name: "OrMaskedUint64x8", argLength: 3, commutative: true},
+		{name: "PermuteUint64x8", argLength: 2, commutative: false},
+		{name: "PermuteInt64x8", argLength: 2, commutative: false},
+		{name: "PermuteFloat64x8", argLength: 2, commutative: false},
+		{name: "Permute2Int64x8", argLength: 3, commutative: false},
+		{name: "Permute2Uint64x8", argLength: 3, commutative: false},
+		{name: "Permute2Float64x8", argLength: 3, commutative: false},
+		{name: "Permute2MaskedUint64x8", argLength: 4, commutative: false},
+		{name: "Permute2MaskedInt64x8", argLength: 4, commutative: false},
+		{name: "Permute2MaskedFloat64x8", argLength: 4, commutative: false},
+		{name: "PermuteMaskedFloat64x8", argLength: 3, commutative: false},
+		{name: "PermuteMaskedInt64x8", argLength: 3, commutative: false},
+		{name: "PermuteMaskedUint64x8", argLength: 3, commutative: false},
 		{name: "PopCountUint64x8", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint64x8", argLength: 2, commutative: false},
 		{name: "RotateLeftUint64x8", argLength: 2, commutative: false},
@@ -1325,6 +1409,14 @@ func simdGenericOps() []opData {
 		{name: "NotEqualUint8x16", argLength: 2, commutative: true},
 		{name: "NotEqualMaskedUint8x16", argLength: 3, commutative: true},
 		{name: "OrUint8x16", argLength: 2, commutative: true},
+		{name: "PermuteUint8x16", argLength: 2, commutative: false},
+		{name: "PermuteInt8x16", argLength: 2, commutative: false},
+		{name: "Permute2Uint8x16", argLength: 3, commutative: false},
+		{name: "Permute2Int8x16", argLength: 3, commutative: false},
+		{name: "Permute2MaskedInt8x16", argLength: 4, commutative: false},
+		{name: "Permute2MaskedUint8x16", argLength: 4, commutative: false},
+		{name: "PermuteMaskedInt8x16", argLength: 3, commutative: false},
+		{name: "PermuteMaskedUint8x16", argLength: 3, commutative: false},
 		{name: "PopCountUint8x16", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint8x16", argLength: 2, commutative: false},
 		{name: "SaturatedAddUint8x16", argLength: 2, commutative: true},
@@ -1361,6 +1453,14 @@ func simdGenericOps() []opData {
 		{name: "NotEqualUint8x32", argLength: 2, commutative: true},
 		{name: "NotEqualMaskedUint8x32", argLength: 3, commutative: true},
 		{name: "OrUint8x32", argLength: 2, commutative: true},
+		{name: "PermuteUint8x32", argLength: 2, commutative: false},
+		{name: "PermuteInt8x32", argLength: 2, commutative: false},
+		{name: "Permute2Int8x32", argLength: 3, commutative: false},
+		{name: "Permute2Uint8x32", argLength: 3, commutative: false},
+		{name: "Permute2MaskedUint8x32", argLength: 4, commutative: false},
+		{name: "Permute2MaskedInt8x32", argLength: 4, commutative: false},
+		{name: "PermuteMaskedUint8x32", argLength: 3, commutative: false},
+		{name: "PermuteMaskedInt8x32", argLength: 3, commutative: false},
 		{name: "PopCountUint8x32", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint8x32", argLength: 2, commutative: false},
 		{name: "SaturatedAddUint8x32", argLength: 2, commutative: true},
@@ -1394,6 +1494,14 @@ func simdGenericOps() []opData {
 		{name: "MinMaskedUint8x64", argLength: 3, commutative: true},
 		{name: "NotEqualUint8x64", argLength: 2, commutative: true},
 		{name: "NotEqualMaskedUint8x64", argLength: 3, commutative: true},
+		{name: "PermuteUint8x64", argLength: 2, commutative: false},
+		{name: "PermuteInt8x64", argLength: 2, commutative: false},
+		{name: "Permute2Int8x64", argLength: 3, commutative: false},
+		{name: "Permute2Uint8x64", argLength: 3, commutative: false},
+		{name: "Permute2MaskedUint8x64", argLength: 4, commutative: false},
+		{name: "Permute2MaskedInt8x64", argLength: 4, commutative: false},
+		{name: "PermuteMaskedInt8x64", argLength: 3, commutative: false},
+		{name: "PermuteMaskedUint8x64", argLength: 3, commutative: false},
 		{name: "PopCountUint8x64", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint8x64", argLength: 2, commutative: false},
 		{name: "SaturatedAddUint8x64", argLength: 2, commutative: true},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index ba28c58b7e..60a12e21fb 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1808,6 +1808,10 @@ const (
 	OpAMD64VPMINUWMasked256
 	OpAMD64VPMULHUW256
 	OpAMD64VPMULHUWMasked256
+	OpAMD64VPERMW256
+	OpAMD64VPERMI2W256
+	OpAMD64VPERMI2WMasked256
+	OpAMD64VPERMWMasked256
 	OpAMD64VPSRLW256
 	OpAMD64VPSRLWMasked256
 	OpAMD64VPSRLVW256
@@ -1820,6 +1824,10 @@ const (
 	OpAMD64VPMINUWMasked512
 	OpAMD64VPMULHUW512
 	OpAMD64VPMULHUWMasked512
+	OpAMD64VPERMW512
+	OpAMD64VPERMI2W512
+	OpAMD64VPERMI2WMasked512
+	OpAMD64VPERMWMasked512
 	OpAMD64VPSRLW512
 	OpAMD64VPSRLWMasked512
 	OpAMD64VPSRLVW512
@@ -1832,6 +1840,10 @@ const (
 	OpAMD64VPMINUWMasked128
 	OpAMD64VPMULHUW128
 	OpAMD64VPMULHUWMasked128
+	OpAMD64VPERMW128
+	OpAMD64VPERMI2W128
+	OpAMD64VPERMI2WMasked128
+	OpAMD64VPERMWMasked128
 	OpAMD64VPSRLW128
 	OpAMD64VPSRLWMasked128
 	OpAMD64VPSRLVW128
@@ -1840,6 +1852,14 @@ const (
 	OpAMD64VPMAXUDMasked512
 	OpAMD64VPMINUD512
 	OpAMD64VPMINUDMasked512
+	OpAMD64VPERMPS512
+	OpAMD64VPERMD512
+	OpAMD64VPERMI2D512
+	OpAMD64VPERMI2PS512
+	OpAMD64VPERMI2DMasked512
+	OpAMD64VPERMI2PSMasked512
+	OpAMD64VPERMPSMasked512
+	OpAMD64VPERMDMasked512
 	OpAMD64VPSRLD512
 	OpAMD64VPSRLDMasked512
 	OpAMD64VPSRLVD512
@@ -1849,6 +1869,10 @@ const (
 	OpAMD64VPMINUD128
 	OpAMD64VPMINUDMasked128
 	OpAMD64VPMULUDQ128
+	OpAMD64VPERMI2D128
+	OpAMD64VPERMI2PS128
+	OpAMD64VPERMI2PSMasked128
+	OpAMD64VPERMI2DMasked128
 	OpAMD64VPSRLD128
 	OpAMD64VPSRLDMasked128
 	OpAMD64VPSRLVD128
@@ -1858,6 +1882,14 @@ const (
 	OpAMD64VPMINUD256
 	OpAMD64VPMINUDMasked256
 	OpAMD64VPMULUDQ256
+	OpAMD64VPERMD256
+	OpAMD64VPERMPS256
+	OpAMD64VPERMI2D256
+	OpAMD64VPERMI2PS256
+	OpAMD64VPERMI2PSMasked256
+	OpAMD64VPERMI2DMasked256
+	OpAMD64VPERMPSMasked256
+	OpAMD64VPERMDMasked256
 	OpAMD64VPSRLD256
 	OpAMD64VPSRLDMasked256
 	OpAMD64VPSRLVD256
@@ -1867,6 +1899,10 @@ const (
 	OpAMD64VPMINUQ128
 	OpAMD64VPMINUQMasked128
 	OpAMD64VPMULUDQMasked128
+	OpAMD64VPERMI2PD128
+	OpAMD64VPERMI2Q128
+	OpAMD64VPERMI2QMasked128
+	OpAMD64VPERMI2PDMasked128
 	OpAMD64VPSRLQ128
 	OpAMD64VPSRLQMasked128
 	OpAMD64VPSRLVQ128
@@ -1876,6 +1912,14 @@ const (
 	OpAMD64VPMINUQ256
 	OpAMD64VPMINUQMasked256
 	OpAMD64VPMULUDQMasked256
+	OpAMD64VPERMQ256
+	OpAMD64VPERMPD256
+	OpAMD64VPERMI2PD256
+	OpAMD64VPERMI2Q256
+	OpAMD64VPERMI2PDMasked256
+	OpAMD64VPERMI2QMasked256
+	OpAMD64VPERMPDMasked256
+	OpAMD64VPERMQMasked256
 	OpAMD64VPSRLQ256
 	OpAMD64VPSRLQMasked256
 	OpAMD64VPSRLVQ256
@@ -1886,6 +1930,14 @@ const (
 	OpAMD64VPMINUQMasked512
 	OpAMD64VPMULUDQ512
 	OpAMD64VPMULUDQMasked512
+	OpAMD64VPERMPD512
+	OpAMD64VPERMQ512
+	OpAMD64VPERMI2Q512
+	OpAMD64VPERMI2PD512
+	OpAMD64VPERMI2QMasked512
+	OpAMD64VPERMI2PDMasked512
+	OpAMD64VPERMPDMasked512
+	OpAMD64VPERMQMasked512
 	OpAMD64VPSRLQ512
 	OpAMD64VPSRLQMasked512
 	OpAMD64VPSRLVQ512
@@ -1898,6 +1950,10 @@ const (
 	OpAMD64VPMAXUBMasked128
 	OpAMD64VPMINUB128
 	OpAMD64VPMINUBMasked128
+	OpAMD64VPERMB128
+	OpAMD64VPERMI2B128
+	OpAMD64VPERMI2BMasked128
+	OpAMD64VPERMBMasked128
 	OpAMD64VPMADDUBSW128
 	OpAMD64VPMADDUBSWMasked128
 	OpAMD64VPAVGB256
@@ -1908,6 +1964,10 @@ const (
 	OpAMD64VPMAXUBMasked256
 	OpAMD64VPMINUB256
 	OpAMD64VPMINUBMasked256
+	OpAMD64VPERMB256
+	OpAMD64VPERMI2B256
+	OpAMD64VPERMI2BMasked256
+	OpAMD64VPERMBMasked256
 	OpAMD64VPMADDUBSW256
 	OpAMD64VPMADDUBSWMasked256
 	OpAMD64VPAVGB512
@@ -1918,6 +1978,10 @@ const (
 	OpAMD64VPMAXUBMasked512
 	OpAMD64VPMINUB512
 	OpAMD64VPMINUBMasked512
+	OpAMD64VPERMB512
+	OpAMD64VPERMI2B512
+	OpAMD64VPERMI2BMasked512
+	OpAMD64VPERMBMasked512
 	OpAMD64VPMADDUBSW512
 	OpAMD64VPMADDUBSWMasked512
 	OpAMD64VRNDSCALEPS512
@@ -5207,6 +5271,14 @@ const (
 	OpOrUint16x16
 	OpPairwiseAddUint16x16
 	OpPairwiseSubUint16x16
+	OpPermuteInt16x16
+	OpPermuteUint16x16
+	OpPermute2Uint16x16
+	OpPermute2Int16x16
+	OpPermute2MaskedUint16x16
+	OpPermute2MaskedInt16x16
+	OpPermuteMaskedUint16x16
+	OpPermuteMaskedInt16x16
 	OpPopCountUint16x16
 	OpPopCountMaskedUint16x16
 	OpSaturatedAddUint16x16
@@ -5250,6 +5322,14 @@ const (
 	OpMulHighMaskedUint16x32
 	OpNotEqualUint16x32
 	OpNotEqualMaskedUint16x32
+	OpPermuteUint16x32
+	OpPermuteInt16x32
+	OpPermute2Int16x32
+	OpPermute2Uint16x32
+	OpPermute2MaskedUint16x32
+	OpPermute2MaskedInt16x32
+	OpPermuteMaskedUint16x32
+	OpPermuteMaskedInt16x32
 	OpPopCountUint16x32
 	OpPopCountMaskedUint16x32
 	OpSaturatedAddUint16x32
@@ -5297,6 +5377,14 @@ const (
 	OpOrUint16x8
 	OpPairwiseAddUint16x8
 	OpPairwiseSubUint16x8
+	OpPermuteUint16x8
+	OpPermuteInt16x8
+	OpPermute2Int16x8
+	OpPermute2Uint16x8
+	OpPermute2MaskedUint16x8
+	OpPermute2MaskedInt16x8
+	OpPermuteMaskedInt16x8
+	OpPermuteMaskedUint16x8
 	OpPopCountUint16x8
 	OpPopCountMaskedUint16x8
 	OpSaturatedAddUint16x8
@@ -5342,6 +5430,18 @@ const (
 	OpNotEqualMaskedUint32x16
 	OpOrUint32x16
 	OpOrMaskedUint32x16
+	OpPermuteInt32x16
+	OpPermuteUint32x16
+	OpPermuteFloat32x16
+	OpPermute2Int32x16
+	OpPermute2Uint32x16
+	OpPermute2Float32x16
+	OpPermute2MaskedUint32x16
+	OpPermute2MaskedInt32x16
+	OpPermute2MaskedFloat32x16
+	OpPermuteMaskedUint32x16
+	OpPermuteMaskedInt32x16
+	OpPermuteMaskedFloat32x16
 	OpPopCountUint32x16
 	OpPopCountMaskedUint32x16
 	OpRotateLeftUint32x16
@@ -5395,6 +5495,12 @@ const (
 	OpOrMaskedUint32x4
 	OpPairwiseAddUint32x4
 	OpPairwiseSubUint32x4
+	OpPermute2Uint32x4
+	OpPermute2Float32x4
+	OpPermute2Int32x4
+	OpPermute2MaskedUint32x4
+	OpPermute2MaskedInt32x4
+	OpPermute2MaskedFloat32x4
 	OpPopCountUint32x4
 	OpPopCountMaskedUint32x4
 	OpRotateLeftUint32x4
@@ -5448,6 +5554,18 @@ const (
 	OpOrMaskedUint32x8
 	OpPairwiseAddUint32x8
 	OpPairwiseSubUint32x8
+	OpPermuteInt32x8
+	OpPermuteFloat32x8
+	OpPermuteUint32x8
+	OpPermute2Uint32x8
+	OpPermute2Float32x8
+	OpPermute2Int32x8
+	OpPermute2MaskedFloat32x8
+	OpPermute2MaskedUint32x8
+	OpPermute2MaskedInt32x8
+	OpPermuteMaskedInt32x8
+	OpPermuteMaskedFloat32x8
+	OpPermuteMaskedUint32x8
 	OpPopCountUint32x8
 	OpPopCountMaskedUint32x8
 	OpRotateLeftUint32x8
@@ -5500,6 +5618,12 @@ const (
 	OpNotEqualMaskedUint64x2
 	OpOrUint64x2
 	OpOrMaskedUint64x2
+	OpPermute2Uint64x2
+	OpPermute2Int64x2
+	OpPermute2Float64x2
+	OpPermute2MaskedUint64x2
+	OpPermute2MaskedInt64x2
+	OpPermute2MaskedFloat64x2
 	OpPopCountUint64x2
 	OpPopCountMaskedUint64x2
 	OpRotateLeftUint64x2
@@ -5548,6 +5672,18 @@ const (
 	OpNotEqualMaskedUint64x4
 	OpOrUint64x4
 	OpOrMaskedUint64x4
+	OpPermuteUint64x4
+	OpPermuteInt64x4
+	OpPermuteFloat64x4
+	OpPermute2Uint64x4
+	OpPermute2Int64x4
+	OpPermute2Float64x4
+	OpPermute2MaskedInt64x4
+	OpPermute2MaskedUint64x4
+	OpPermute2MaskedFloat64x4
+	OpPermuteMaskedFloat64x4
+	OpPermuteMaskedInt64x4
+	OpPermuteMaskedUint64x4
 	OpPopCountUint64x4
 	OpPopCountMaskedUint64x4
 	OpRotateLeftUint64x4
@@ -5596,6 +5732,18 @@ const (
 	OpNotEqualMaskedUint64x8
 	OpOrUint64x8
 	OpOrMaskedUint64x8
+	OpPermuteUint64x8
+	OpPermuteInt64x8
+	OpPermuteFloat64x8
+	OpPermute2Int64x8
+	OpPermute2Uint64x8
+	OpPermute2Float64x8
+	OpPermute2MaskedUint64x8
+	OpPermute2MaskedInt64x8
+	OpPermute2MaskedFloat64x8
+	OpPermuteMaskedFloat64x8
+	OpPermuteMaskedInt64x8
+	OpPermuteMaskedUint64x8
 	OpPopCountUint64x8
 	OpPopCountMaskedUint64x8
 	OpRotateLeftUint64x8
@@ -5643,6 +5791,14 @@ const (
 	OpNotEqualUint8x16
 	OpNotEqualMaskedUint8x16
 	OpOrUint8x16
+	OpPermuteUint8x16
+	OpPermuteInt8x16
+	OpPermute2Uint8x16
+	OpPermute2Int8x16
+	OpPermute2MaskedInt8x16
+	OpPermute2MaskedUint8x16
+	OpPermuteMaskedInt8x16
+	OpPermuteMaskedUint8x16
 	OpPopCountUint8x16
 	OpPopCountMaskedUint8x16
 	OpSaturatedAddUint8x16
@@ -5679,6 +5835,14 @@ const (
 	OpNotEqualUint8x32
 	OpNotEqualMaskedUint8x32
 	OpOrUint8x32
+	OpPermuteUint8x32
+	OpPermuteInt8x32
+	OpPermute2Int8x32
+	OpPermute2Uint8x32
+	OpPermute2MaskedUint8x32
+	OpPermute2MaskedInt8x32
+	OpPermuteMaskedUint8x32
+	OpPermuteMaskedInt8x32
 	OpPopCountUint8x32
 	OpPopCountMaskedUint8x32
 	OpSaturatedAddUint8x32
@@ -5712,6 +5876,14 @@ const (
 	OpMinMaskedUint8x64
 	OpNotEqualUint8x64
 	OpNotEqualMaskedUint8x64
+	OpPermuteUint8x64
+	OpPermuteInt8x64
+	OpPermute2Int8x64
+	OpPermute2Uint8x64
+	OpPermute2MaskedUint8x64
+	OpPermute2MaskedInt8x64
+	OpPermuteMaskedInt8x64
+	OpPermuteMaskedUint8x64
 	OpPopCountUint8x64
 	OpPopCountMaskedUint8x64
 	OpSaturatedAddUint8x64
@@ -27735,6 +27907,68 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:   "VPERMW256",
+		argLen: 2,
+		asm:    x86.AVPERMW,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+			outputs: []outputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+		},
+	},
+	{
+		name:         "VPERMI2W256",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2W,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2WMasked256",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2W,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:   "VPERMWMasked256",
+		argLen: 3,
+		asm:    x86.AVPERMW,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
 	{
 		name:   "VPSRLW256",
 		argLen: 2,
@@ -27917,6 +28151,68 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:   "VPERMW512",
+		argLen: 2,
+		asm:    x86.AVPERMW,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+			outputs: []outputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+		},
+	},
+	{
+		name:         "VPERMI2W512",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2W,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2WMasked512",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2W,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:   "VPERMWMasked512",
+		argLen: 3,
+		asm:    x86.AVPERMW,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
 	{
 		name:   "VPSRLW512",
 		argLen: 2,
@@ -28099,6 +28395,68 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:   "VPERMW128",
+		argLen: 2,
+		asm:    x86.AVPERMW,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+			outputs: []outputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+		},
+	},
+	{
+		name:         "VPERMI2W128",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2W,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2WMasked128",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2W,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:   "VPERMWMasked128",
+		argLen: 3,
+		asm:    x86.AVPERMW,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
 	{
 		name:   "VPSRLW128",
 		argLen: 2,
@@ -28219,6 +28577,130 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:   "VPERMPS512",
+		argLen: 2,
+		asm:    x86.AVPERMPS,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+			outputs: []outputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+		},
+	},
+	{
+		name:   "VPERMD512",
+		argLen: 2,
+		asm:    x86.AVPERMD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+			outputs: []outputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+		},
+	},
+	{
+		name:         "VPERMI2D512",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2D,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2PS512",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2PS,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2DMasked512",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2D,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2PSMasked512",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2PS,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:   "VPERMPSMasked512",
+		argLen: 3,
+		asm:    x86.AVPERMPS,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:   "VPERMDMasked512",
+		argLen: 3,
+		asm:    x86.AVPERMD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
 	{
 		name:   "VPSRLD512",
 		argLen: 2,
@@ -28354,6 +28836,72 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:         "VPERMI2D128",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2D,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2PS128",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2PS,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2PSMasked128",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2PS,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2DMasked128",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2D,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
 	{
 		name:   "VPSRLD128",
 		argLen: 2,
@@ -28489,6 +29037,130 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:   "VPERMD256",
+		argLen: 2,
+		asm:    x86.AVPERMD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:   "VPERMPS256",
+		argLen: 2,
+		asm:    x86.AVPERMPS,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2D256",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2D,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2PS256",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2PS,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2PSMasked256",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2PS,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2DMasked256",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2D,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:   "VPERMPSMasked256",
+		argLen: 3,
+		asm:    x86.AVPERMPS,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:   "VPERMDMasked256",
+		argLen: 3,
+		asm:    x86.AVPERMD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
 	{
 		name:   "VPSRLD256",
 		argLen: 2,
@@ -28625,6 +29297,72 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:         "VPERMI2PD128",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2PD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2Q128",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2Q,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2QMasked128",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2Q,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2PDMasked128",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2PD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
 	{
 		name:   "VPSRLQ128",
 		argLen: 2,
@@ -28761,6 +29499,130 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:   "VPERMQ256",
+		argLen: 2,
+		asm:    x86.AVPERMQ,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+			outputs: []outputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+		},
+	},
+	{
+		name:   "VPERMPD256",
+		argLen: 2,
+		asm:    x86.AVPERMPD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+			outputs: []outputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+		},
+	},
+	{
+		name:         "VPERMI2PD256",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2PD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2Q256",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2Q,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2PDMasked256",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2PD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2QMasked256",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2Q,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:   "VPERMPDMasked256",
+		argLen: 3,
+		asm:    x86.AVPERMPD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:   "VPERMQMasked256",
+		argLen: 3,
+		asm:    x86.AVPERMQ,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
 	{
 		name:   "VPSRLQ256",
 		argLen: 2,
@@ -28912,6 +29774,130 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:   "VPERMPD512",
+		argLen: 2,
+		asm:    x86.AVPERMPD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+			outputs: []outputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+		},
+	},
+	{
+		name:   "VPERMQ512",
+		argLen: 2,
+		asm:    x86.AVPERMQ,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+			outputs: []outputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+		},
+	},
+	{
+		name:         "VPERMI2Q512",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2Q,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2PD512",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2PD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2QMasked512",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2Q,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2PDMasked512",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2PD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:   "VPERMPDMasked512",
+		argLen: 3,
+		asm:    x86.AVPERMPD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:   "VPERMQMasked512",
+		argLen: 3,
+		asm:    x86.AVPERMQ,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
 	{
 		name:   "VPSRLQ512",
 		argLen: 2,
@@ -29092,6 +30078,68 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:   "VPERMB128",
+		argLen: 2,
+		asm:    x86.AVPERMB,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+			outputs: []outputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+		},
+	},
+	{
+		name:         "VPERMI2B128",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2B,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2BMasked128",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2B,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:   "VPERMBMasked128",
+		argLen: 3,
+		asm:    x86.AVPERMB,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
 	{
 		name:   "VPMADDUBSW128",
 		argLen: 2,
@@ -29243,6 +30291,68 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:   "VPERMB256",
+		argLen: 2,
+		asm:    x86.AVPERMB,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+			outputs: []outputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+		},
+	},
+	{
+		name:         "VPERMI2B256",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2B,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2BMasked256",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2B,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:   "VPERMBMasked256",
+		argLen: 3,
+		asm:    x86.AVPERMB,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
 	{
 		name:   "VPMADDUBSW256",
 		argLen: 2,
@@ -29394,6 +30504,68 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:   "VPERMB512",
+		argLen: 2,
+		asm:    x86.AVPERMB,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+				{1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+			outputs: []outputInfo{
+				{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+			},
+		},
+	},
+	{
+		name:         "VPERMI2B512",
+		argLen:       3,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2B,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:         "VPERMI2BMasked512",
+		argLen:       4,
+		resultInArg0: true,
+		asm:          x86.AVPERMI2B,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
+	{
+		name:   "VPERMBMasked512",
+		argLen: 3,
+		asm:    x86.AVPERMB,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+				{0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
 	{
 		name:   "VPMADDUBSW512",
 		argLen: 2,
@@ -64012,6 +65184,46 @@ var opcodeTable = [...]opInfo{
 		argLen:  2,
 		generic: true,
 	},
+	{
+		name:    "PermuteInt16x16",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "PermuteUint16x16",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "Permute2Uint16x16",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Int16x16",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedUint16x16",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedInt16x16",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedUint16x16",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedInt16x16",
+		argLen:  3,
+		generic: true,
+	},
 	{
 		name:    "PopCountUint16x16",
 		argLen:  1,
@@ -64244,6 +65456,46 @@ var opcodeTable = [...]opInfo{
 		commutative: true,
 		generic:     true,
 	},
+	{
+		name:    "PermuteUint16x32",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "PermuteInt16x32",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "Permute2Int16x32",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Uint16x32",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedUint16x32",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedInt16x32",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedUint16x32",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedInt16x32",
+		argLen:  3,
+		generic: true,
+	},
 	{
 		name:    "PopCountUint16x32",
 		argLen:  1,
@@ -64497,6 +65749,46 @@ var opcodeTable = [...]opInfo{
 		argLen:  2,
 		generic: true,
 	},
+	{
+		name:    "PermuteUint16x8",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "PermuteInt16x8",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "Permute2Int16x8",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Uint16x8",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedUint16x8",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedInt16x8",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedInt16x8",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedUint16x8",
+		argLen:  3,
+		generic: true,
+	},
 	{
 		name:    "PopCountUint16x8",
 		argLen:  1,
@@ -64739,6 +66031,66 @@ var opcodeTable = [...]opInfo{
 		commutative: true,
 		generic:     true,
 	},
+	{
+		name:    "PermuteInt32x16",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "PermuteUint32x16",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "PermuteFloat32x16",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "Permute2Int32x16",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Uint32x16",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Float32x16",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedUint32x16",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedInt32x16",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedFloat32x16",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedUint32x16",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedInt32x16",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedFloat32x16",
+		argLen:  3,
+		generic: true,
+	},
 	{
 		name:    "PopCountUint32x16",
 		argLen:  1,
@@ -65021,6 +66373,36 @@ var opcodeTable = [...]opInfo{
 		argLen:  2,
 		generic: true,
 	},
+	{
+		name:    "Permute2Uint32x4",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Float32x4",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Int32x4",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedUint32x4",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedInt32x4",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedFloat32x4",
+		argLen:  4,
+		generic: true,
+	},
 	{
 		name:    "PopCountUint32x4",
 		argLen:  1,
@@ -65303,6 +66685,66 @@ var opcodeTable = [...]opInfo{
 		argLen:  2,
 		generic: true,
 	},
+	{
+		name:    "PermuteInt32x8",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "PermuteFloat32x8",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "PermuteUint32x8",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "Permute2Uint32x8",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Float32x8",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Int32x8",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedFloat32x8",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedUint32x8",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedInt32x8",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedInt32x8",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedFloat32x8",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedUint32x8",
+		argLen:  3,
+		generic: true,
+	},
 	{
 		name:    "PopCountUint32x8",
 		argLen:  1,
@@ -65581,6 +67023,36 @@ var opcodeTable = [...]opInfo{
 		commutative: true,
 		generic:     true,
 	},
+	{
+		name:    "Permute2Uint64x2",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Int64x2",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Float64x2",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedUint64x2",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedInt64x2",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedFloat64x2",
+		argLen:  4,
+		generic: true,
+	},
 	{
 		name:    "PopCountUint64x2",
 		argLen:  1,
@@ -65839,6 +67311,66 @@ var opcodeTable = [...]opInfo{
 		commutative: true,
 		generic:     true,
 	},
+	{
+		name:    "PermuteUint64x4",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "PermuteInt64x4",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "PermuteFloat64x4",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "Permute2Uint64x4",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Int64x4",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Float64x4",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedInt64x4",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedUint64x4",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedFloat64x4",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedFloat64x4",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedInt64x4",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedUint64x4",
+		argLen:  3,
+		generic: true,
+	},
 	{
 		name:    "PopCountUint64x4",
 		argLen:  1,
@@ -66097,6 +67629,66 @@ var opcodeTable = [...]opInfo{
 		commutative: true,
 		generic:     true,
 	},
+	{
+		name:    "PermuteUint64x8",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "PermuteInt64x8",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "PermuteFloat64x8",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "Permute2Int64x8",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Uint64x8",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Float64x8",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedUint64x8",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedInt64x8",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedFloat64x8",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedFloat64x8",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedInt64x8",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedUint64x8",
+		argLen:  3,
+		generic: true,
+	},
 	{
 		name:    "PopCountUint64x8",
 		argLen:  1,
@@ -66348,6 +67940,46 @@ var opcodeTable = [...]opInfo{
 		commutative: true,
 		generic:     true,
 	},
+	{
+		name:    "PermuteUint8x16",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "PermuteInt8x16",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "Permute2Uint8x16",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Int8x16",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedInt8x16",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedUint8x16",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedInt8x16",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedUint8x16",
+		argLen:  3,
+		generic: true,
+	},
 	{
 		name:    "PopCountUint8x16",
 		argLen:  1,
@@ -66545,6 +68177,46 @@ var opcodeTable = [...]opInfo{
 		commutative: true,
 		generic:     true,
 	},
+	{
+		name:    "PermuteUint8x32",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "PermuteInt8x32",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "Permute2Int8x32",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Uint8x32",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedUint8x32",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedInt8x32",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedUint8x32",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedInt8x32",
+		argLen:  3,
+		generic: true,
+	},
 	{
 		name:    "PopCountUint8x32",
 		argLen:  1,
@@ -66725,6 +68397,46 @@ var opcodeTable = [...]opInfo{
 		commutative: true,
 		generic:     true,
 	},
+	{
+		name:    "PermuteUint8x64",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "PermuteInt8x64",
+		argLen:  2,
+		generic: true,
+	},
+	{
+		name:    "Permute2Int8x64",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2Uint8x64",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedUint8x64",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "Permute2MaskedInt8x64",
+		argLen:  4,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedInt8x64",
+		argLen:  3,
+		generic: true,
+	},
+	{
+		name:    "PermuteMaskedUint8x64",
+		argLen:  3,
+		generic: true,
+	},
 	{
 		name:    "PopCountUint8x64",
 		argLen:  1,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 6d10b009bb..1aa36bee04 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -3298,6 +3298,276 @@ func rewriteValueAMD64(v *Value) bool {
 		return true
 	case OpPanicBounds:
 		return rewriteValueAMD64_OpPanicBounds(v)
+	case OpPermute2Float32x16:
+		v.Op = OpAMD64VPERMI2PS512
+		return true
+	case OpPermute2Float32x4:
+		v.Op = OpAMD64VPERMI2PS128
+		return true
+	case OpPermute2Float32x8:
+		v.Op = OpAMD64VPERMI2PS256
+		return true
+	case OpPermute2Float64x2:
+		v.Op = OpAMD64VPERMI2PD128
+		return true
+	case OpPermute2Float64x4:
+		v.Op = OpAMD64VPERMI2PD256
+		return true
+	case OpPermute2Float64x8:
+		v.Op = OpAMD64VPERMI2PD512
+		return true
+	case OpPermute2Int16x16:
+		v.Op = OpAMD64VPERMI2W256
+		return true
+	case OpPermute2Int16x32:
+		v.Op = OpAMD64VPERMI2W512
+		return true
+	case OpPermute2Int16x8:
+		v.Op = OpAMD64VPERMI2W128
+		return true
+	case OpPermute2Int32x16:
+		v.Op = OpAMD64VPERMI2D512
+		return true
+	case OpPermute2Int32x4:
+		v.Op = OpAMD64VPERMI2D128
+		return true
+	case OpPermute2Int32x8:
+		v.Op = OpAMD64VPERMI2D256
+		return true
+	case OpPermute2Int64x2:
+		v.Op = OpAMD64VPERMI2Q128
+		return true
+	case OpPermute2Int64x4:
+		v.Op = OpAMD64VPERMI2Q256
+		return true
+	case OpPermute2Int64x8:
+		v.Op = OpAMD64VPERMI2Q512
+		return true
+	case OpPermute2Int8x16:
+		v.Op = OpAMD64VPERMI2B128
+		return true
+	case OpPermute2Int8x32:
+		v.Op = OpAMD64VPERMI2B256
+		return true
+	case OpPermute2Int8x64:
+		v.Op = OpAMD64VPERMI2B512
+		return true
+	case OpPermute2MaskedFloat32x16:
+		return rewriteValueAMD64_OpPermute2MaskedFloat32x16(v)
+	case OpPermute2MaskedFloat32x4:
+		return rewriteValueAMD64_OpPermute2MaskedFloat32x4(v)
+	case OpPermute2MaskedFloat32x8:
+		return rewriteValueAMD64_OpPermute2MaskedFloat32x8(v)
+	case OpPermute2MaskedFloat64x2:
+		return rewriteValueAMD64_OpPermute2MaskedFloat64x2(v)
+	case OpPermute2MaskedFloat64x4:
+		return rewriteValueAMD64_OpPermute2MaskedFloat64x4(v)
+	case OpPermute2MaskedFloat64x8:
+		return rewriteValueAMD64_OpPermute2MaskedFloat64x8(v)
+	case OpPermute2MaskedInt16x16:
+		return rewriteValueAMD64_OpPermute2MaskedInt16x16(v)
+	case OpPermute2MaskedInt16x32:
+		return rewriteValueAMD64_OpPermute2MaskedInt16x32(v)
+	case OpPermute2MaskedInt16x8:
+		return rewriteValueAMD64_OpPermute2MaskedInt16x8(v)
+	case OpPermute2MaskedInt32x16:
+		return rewriteValueAMD64_OpPermute2MaskedInt32x16(v)
+	case OpPermute2MaskedInt32x4:
+		return rewriteValueAMD64_OpPermute2MaskedInt32x4(v)
+	case OpPermute2MaskedInt32x8:
+		return rewriteValueAMD64_OpPermute2MaskedInt32x8(v)
+	case OpPermute2MaskedInt64x2:
+		return rewriteValueAMD64_OpPermute2MaskedInt64x2(v)
+	case OpPermute2MaskedInt64x4:
+		return rewriteValueAMD64_OpPermute2MaskedInt64x4(v)
+	case OpPermute2MaskedInt64x8:
+		return rewriteValueAMD64_OpPermute2MaskedInt64x8(v)
+	case OpPermute2MaskedInt8x16:
+		return rewriteValueAMD64_OpPermute2MaskedInt8x16(v)
+	case OpPermute2MaskedInt8x32:
+		return rewriteValueAMD64_OpPermute2MaskedInt8x32(v)
+	case OpPermute2MaskedInt8x64:
+		return rewriteValueAMD64_OpPermute2MaskedInt8x64(v)
+	case OpPermute2MaskedUint16x16:
+		return rewriteValueAMD64_OpPermute2MaskedUint16x16(v)
+	case OpPermute2MaskedUint16x32:
+		return rewriteValueAMD64_OpPermute2MaskedUint16x32(v)
+	case OpPermute2MaskedUint16x8:
+		return rewriteValueAMD64_OpPermute2MaskedUint16x8(v)
+	case OpPermute2MaskedUint32x16:
+		return rewriteValueAMD64_OpPermute2MaskedUint32x16(v)
+	case OpPermute2MaskedUint32x4:
+		return rewriteValueAMD64_OpPermute2MaskedUint32x4(v)
+	case OpPermute2MaskedUint32x8:
+		return rewriteValueAMD64_OpPermute2MaskedUint32x8(v)
+	case OpPermute2MaskedUint64x2:
+		return rewriteValueAMD64_OpPermute2MaskedUint64x2(v)
+	case OpPermute2MaskedUint64x4:
+		return rewriteValueAMD64_OpPermute2MaskedUint64x4(v)
+	case OpPermute2MaskedUint64x8:
+		return rewriteValueAMD64_OpPermute2MaskedUint64x8(v)
+	case OpPermute2MaskedUint8x16:
+		return rewriteValueAMD64_OpPermute2MaskedUint8x16(v)
+	case OpPermute2MaskedUint8x32:
+		return rewriteValueAMD64_OpPermute2MaskedUint8x32(v)
+	case OpPermute2MaskedUint8x64:
+		return rewriteValueAMD64_OpPermute2MaskedUint8x64(v)
+	case OpPermute2Uint16x16:
+		v.Op = OpAMD64VPERMI2W256
+		return true
+	case OpPermute2Uint16x32:
+		v.Op = OpAMD64VPERMI2W512
+		return true
+	case OpPermute2Uint16x8:
+		v.Op = OpAMD64VPERMI2W128
+		return true
+	case OpPermute2Uint32x16:
+		v.Op = OpAMD64VPERMI2D512
+		return true
+	case OpPermute2Uint32x4:
+		v.Op = OpAMD64VPERMI2D128
+		return true
+	case OpPermute2Uint32x8:
+		v.Op = OpAMD64VPERMI2D256
+		return true
+	case OpPermute2Uint64x2:
+		v.Op = OpAMD64VPERMI2Q128
+		return true
+	case OpPermute2Uint64x4:
+		v.Op = OpAMD64VPERMI2Q256
+		return true
+	case OpPermute2Uint64x8:
+		v.Op = OpAMD64VPERMI2Q512
+		return true
+	case OpPermute2Uint8x16:
+		v.Op = OpAMD64VPERMI2B128
+		return true
+	case OpPermute2Uint8x32:
+		v.Op = OpAMD64VPERMI2B256
+		return true
+	case OpPermute2Uint8x64:
+		v.Op = OpAMD64VPERMI2B512
+		return true
+	case OpPermuteFloat32x16:
+		v.Op = OpAMD64VPERMPS512
+		return true
+	case OpPermuteFloat32x8:
+		v.Op = OpAMD64VPERMPS256
+		return true
+	case OpPermuteFloat64x4:
+		v.Op = OpAMD64VPERMPD256
+		return true
+	case OpPermuteFloat64x8:
+		v.Op = OpAMD64VPERMPD512
+		return true
+	case OpPermuteInt16x16:
+		v.Op = OpAMD64VPERMW256
+		return true
+	case OpPermuteInt16x32:
+		v.Op = OpAMD64VPERMW512
+		return true
+	case OpPermuteInt16x8:
+		v.Op = OpAMD64VPERMW128
+		return true
+	case OpPermuteInt32x16:
+		v.Op = OpAMD64VPERMD512
+		return true
+	case OpPermuteInt32x8:
+		v.Op = OpAMD64VPERMD256
+		return true
+	case OpPermuteInt64x4:
+		v.Op = OpAMD64VPERMQ256
+		return true
+	case OpPermuteInt64x8:
+		v.Op = OpAMD64VPERMQ512
+		return true
+	case OpPermuteInt8x16:
+		v.Op = OpAMD64VPERMB128
+		return true
+	case OpPermuteInt8x32:
+		v.Op = OpAMD64VPERMB256
+		return true
+	case OpPermuteInt8x64:
+		v.Op = OpAMD64VPERMB512
+		return true
+	case OpPermuteMaskedFloat32x16:
+		return rewriteValueAMD64_OpPermuteMaskedFloat32x16(v)
+	case OpPermuteMaskedFloat32x8:
+		return rewriteValueAMD64_OpPermuteMaskedFloat32x8(v)
+	case OpPermuteMaskedFloat64x4:
+		return rewriteValueAMD64_OpPermuteMaskedFloat64x4(v)
+	case OpPermuteMaskedFloat64x8:
+		return rewriteValueAMD64_OpPermuteMaskedFloat64x8(v)
+	case OpPermuteMaskedInt16x16:
+		return rewriteValueAMD64_OpPermuteMaskedInt16x16(v)
+	case OpPermuteMaskedInt16x32:
+		return rewriteValueAMD64_OpPermuteMaskedInt16x32(v)
+	case OpPermuteMaskedInt16x8:
+		return rewriteValueAMD64_OpPermuteMaskedInt16x8(v)
+	case OpPermuteMaskedInt32x16:
+		return rewriteValueAMD64_OpPermuteMaskedInt32x16(v)
+	case OpPermuteMaskedInt32x8:
+		return rewriteValueAMD64_OpPermuteMaskedInt32x8(v)
+	case OpPermuteMaskedInt64x4:
+		return rewriteValueAMD64_OpPermuteMaskedInt64x4(v)
+	case OpPermuteMaskedInt64x8:
+		return rewriteValueAMD64_OpPermuteMaskedInt64x8(v)
+	case OpPermuteMaskedInt8x16:
+		return rewriteValueAMD64_OpPermuteMaskedInt8x16(v)
+	case OpPermuteMaskedInt8x32:
+		return rewriteValueAMD64_OpPermuteMaskedInt8x32(v)
+	case OpPermuteMaskedInt8x64:
+		return rewriteValueAMD64_OpPermuteMaskedInt8x64(v)
+	case OpPermuteMaskedUint16x16:
+		return rewriteValueAMD64_OpPermuteMaskedUint16x16(v)
+	case OpPermuteMaskedUint16x32:
+		return rewriteValueAMD64_OpPermuteMaskedUint16x32(v)
+	case OpPermuteMaskedUint16x8:
+		return rewriteValueAMD64_OpPermuteMaskedUint16x8(v)
+	case OpPermuteMaskedUint32x16:
+		return rewriteValueAMD64_OpPermuteMaskedUint32x16(v)
+	case OpPermuteMaskedUint32x8:
+		return rewriteValueAMD64_OpPermuteMaskedUint32x8(v)
+	case OpPermuteMaskedUint64x4:
+		return rewriteValueAMD64_OpPermuteMaskedUint64x4(v)
+	case OpPermuteMaskedUint64x8:
+		return rewriteValueAMD64_OpPermuteMaskedUint64x8(v)
+	case OpPermuteMaskedUint8x16:
+		return rewriteValueAMD64_OpPermuteMaskedUint8x16(v)
+	case OpPermuteMaskedUint8x32:
+		return rewriteValueAMD64_OpPermuteMaskedUint8x32(v)
+	case OpPermuteMaskedUint8x64:
+		return rewriteValueAMD64_OpPermuteMaskedUint8x64(v)
+	case OpPermuteUint16x16:
+		v.Op = OpAMD64VPERMW256
+		return true
+	case OpPermuteUint16x32:
+		v.Op = OpAMD64VPERMW512
+		return true
+	case OpPermuteUint16x8:
+		v.Op = OpAMD64VPERMW128
+		return true
+	case OpPermuteUint32x16:
+		v.Op = OpAMD64VPERMD512
+		return true
+	case OpPermuteUint32x8:
+		v.Op = OpAMD64VPERMD256
+		return true
+	case OpPermuteUint64x4:
+		v.Op = OpAMD64VPERMQ256
+		return true
+	case OpPermuteUint64x8:
+		v.Op = OpAMD64VPERMQ512
+		return true
+	case OpPermuteUint8x16:
+		v.Op = OpAMD64VPERMB128
+		return true
+	case OpPermuteUint8x32:
+		v.Op = OpAMD64VPERMB256
+		return true
+	case OpPermuteUint8x64:
+		v.Op = OpAMD64VPERMB512
+		return true
 	case OpPopCount16:
 		return rewriteValueAMD64_OpPopCount16(v)
 	case OpPopCount32:
@@ -44315,6 +44585,1038 @@ func rewriteValueAMD64_OpPanicBounds(v *Value) bool {
 	}
 	return false
 }
+func rewriteValueAMD64_OpPermute2MaskedFloat32x16(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedFloat32x16 x y z mask)
+	// result: (VPERMI2PSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2PSMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedFloat32x4(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedFloat32x4 x y z mask)
+	// result: (VPERMI2PSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2PSMasked128)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedFloat32x8(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedFloat32x8 x y z mask)
+	// result: (VPERMI2PSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2PSMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedFloat64x2(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedFloat64x2 x y z mask)
+	// result: (VPERMI2PDMasked128 x y z (VPMOVVec64x2ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2PDMasked128)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedFloat64x4(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedFloat64x4 x y z mask)
+	// result: (VPERMI2PDMasked256 x y z (VPMOVVec64x4ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2PDMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedFloat64x8(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedFloat64x8 x y z mask)
+	// result: (VPERMI2PDMasked512 x y z (VPMOVVec64x8ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2PDMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedInt16x16(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedInt16x16 x y z mask)
+	// result: (VPERMI2WMasked256 x y z (VPMOVVec16x16ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2WMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedInt16x32(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedInt16x32 x y z mask)
+	// result: (VPERMI2WMasked512 x y z (VPMOVVec16x32ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2WMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedInt16x8(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedInt16x8 x y z mask)
+	// result: (VPERMI2WMasked128 x y z (VPMOVVec16x8ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2WMasked128)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedInt32x16(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedInt32x16 x y z mask)
+	// result: (VPERMI2DMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2DMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedInt32x4(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedInt32x4 x y z mask)
+	// result: (VPERMI2DMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2DMasked128)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedInt32x8(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedInt32x8 x y z mask)
+	// result: (VPERMI2DMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2DMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedInt64x2(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedInt64x2 x y z mask)
+	// result: (VPERMI2QMasked128 x y z (VPMOVVec64x2ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2QMasked128)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedInt64x4(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedInt64x4 x y z mask)
+	// result: (VPERMI2QMasked256 x y z (VPMOVVec64x4ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2QMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedInt64x8(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedInt64x8 x y z mask)
+	// result: (VPERMI2QMasked512 x y z (VPMOVVec64x8ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2QMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedInt8x16(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedInt8x16 x y z mask)
+	// result: (VPERMI2BMasked128 x y z (VPMOVVec8x16ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2BMasked128)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedInt8x32(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedInt8x32 x y z mask)
+	// result: (VPERMI2BMasked256 x y z (VPMOVVec8x32ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2BMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedInt8x64(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedInt8x64 x y z mask)
+	// result: (VPERMI2BMasked512 x y z (VPMOVVec8x64ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2BMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedUint16x16(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedUint16x16 x y z mask)
+	// result: (VPERMI2WMasked256 x y z (VPMOVVec16x16ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2WMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedUint16x32(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedUint16x32 x y z mask)
+	// result: (VPERMI2WMasked512 x y z (VPMOVVec16x32ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2WMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedUint16x8(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedUint16x8 x y z mask)
+	// result: (VPERMI2WMasked128 x y z (VPMOVVec16x8ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2WMasked128)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedUint32x16(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedUint32x16 x y z mask)
+	// result: (VPERMI2DMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2DMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedUint32x4(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedUint32x4 x y z mask)
+	// result: (VPERMI2DMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2DMasked128)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedUint32x8(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedUint32x8 x y z mask)
+	// result: (VPERMI2DMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2DMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedUint64x2(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedUint64x2 x y z mask)
+	// result: (VPERMI2QMasked128 x y z (VPMOVVec64x2ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2QMasked128)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedUint64x4(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedUint64x4 x y z mask)
+	// result: (VPERMI2QMasked256 x y z (VPMOVVec64x4ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2QMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedUint64x8(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedUint64x8 x y z mask)
+	// result: (VPERMI2QMasked512 x y z (VPMOVVec64x8ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2QMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedUint8x16(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedUint8x16 x y z mask)
+	// result: (VPERMI2BMasked128 x y z (VPMOVVec8x16ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2BMasked128)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedUint8x32(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedUint8x32 x y z mask)
+	// result: (VPERMI2BMasked256 x y z (VPMOVVec8x32ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2BMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermute2MaskedUint8x64(v *Value) bool {
+	v_3 := v.Args[3]
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (Permute2MaskedUint8x64 x y z mask)
+	// result: (VPERMI2BMasked512 x y z (VPMOVVec8x64ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		z := v_2
+		mask := v_3
+		v.reset(OpAMD64VPERMI2BMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg4(x, y, z, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedFloat32x16(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedFloat32x16 x y mask)
+	// result: (VPERMPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMPSMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedFloat32x8(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedFloat32x8 x y mask)
+	// result: (VPERMPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMPSMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedFloat64x4(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedFloat64x4 x y mask)
+	// result: (VPERMPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMPDMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedFloat64x8(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedFloat64x8 x y mask)
+	// result: (VPERMPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMPDMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedInt16x16(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedInt16x16 x y mask)
+	// result: (VPERMWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMWMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedInt16x32(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedInt16x32 x y mask)
+	// result: (VPERMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMWMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedInt16x8(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedInt16x8 x y mask)
+	// result: (VPERMWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMWMasked128)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedInt32x16(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedInt32x16 x y mask)
+	// result: (VPERMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMDMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedInt32x8(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedInt32x8 x y mask)
+	// result: (VPERMDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMDMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedInt64x4(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedInt64x4 x y mask)
+	// result: (VPERMQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMQMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedInt64x8(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedInt64x8 x y mask)
+	// result: (VPERMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMQMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedInt8x16(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedInt8x16 x y mask)
+	// result: (VPERMBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMBMasked128)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedInt8x32(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedInt8x32 x y mask)
+	// result: (VPERMBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMBMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedInt8x64(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedInt8x64 x y mask)
+	// result: (VPERMBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMBMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedUint16x16(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedUint16x16 x y mask)
+	// result: (VPERMWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMWMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedUint16x32(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedUint16x32 x y mask)
+	// result: (VPERMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMWMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedUint16x8(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedUint16x8 x y mask)
+	// result: (VPERMWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMWMasked128)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedUint32x16(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedUint32x16 x y mask)
+	// result: (VPERMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMDMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedUint32x8(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedUint32x8 x y mask)
+	// result: (VPERMDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMDMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedUint64x4(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedUint64x4 x y mask)
+	// result: (VPERMQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMQMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedUint64x8(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedUint64x8 x y mask)
+	// result: (VPERMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMQMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedUint8x16(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedUint8x16 x y mask)
+	// result: (VPERMBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMBMasked128)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedUint8x32(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedUint8x32 x y mask)
+	// result: (VPERMBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMBMasked256)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
+func rewriteValueAMD64_OpPermuteMaskedUint8x64(v *Value) bool {
+	v_2 := v.Args[2]
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (PermuteMaskedUint8x64 x y mask)
+	// result: (VPERMBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+	for {
+		x := v_0
+		y := v_1
+		mask := v_2
+		v.reset(OpAMD64VPERMBMasked512)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+		v0.AddArg(mask)
+		v.AddArg3(x, y, v0)
+		return true
+	}
+}
 func rewriteValueAMD64_OpPopCount16(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go
index c47b089815..fd7ebb20a3 100644
--- a/src/cmd/compile/internal/ssagen/intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/intrinsics.go
@@ -1622,18 +1622,42 @@ func opLen2(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa
 	}
 }
 
+func opLen2_21(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+		return s.newValue2(op, t, args[1], args[0])
+	}
+}
+
 func opLen3(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 		return s.newValue3(op, t, args[0], args[1], args[2])
 	}
 }
 
+func opLen3_21(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+		return s.newValue3(op, t, args[1], args[0], args[2])
+	}
+}
+
+func opLen3_231(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+		return s.newValue3(op, t, args[2], args[0], args[1])
+	}
+}
+
 func opLen4(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 		return s.newValue4(op, t, args[0], args[1], args[2], args[3])
 	}
 }
 
+func opLen4_231(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+		return s.newValue4(op, t, args[2], args[0], args[1], args[3])
+	}
+}
+
 func plainPanicSimdImm(s *state) {
 	cmp := s.newValue0(ssa.OpConstBool, types.Types[types.TBOOL])
 	cmp.AuxInt = 0
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go
index 58bc420fc4..3805ca35a8 100644
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -996,6 +996,114 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint16x16.PairwiseSub", opLen2(ssa.OpPairwiseSubUint16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint32x4.PairwiseSub", opLen2(ssa.OpPairwiseSubUint32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint32x8.PairwiseSub", opLen2(ssa.OpPairwiseSubUint32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint8x32.Permute", opLen2_21(ssa.OpPermuteUint8x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int8x64.Permute", opLen2_21(ssa.OpPermuteInt8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint8x64.Permute", opLen2_21(ssa.OpPermuteUint8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int16x8.Permute", opLen2_21(ssa.OpPermuteInt16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint16x8.Permute", opLen2_21(ssa.OpPermuteUint16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x16.Permute", opLen2_21(ssa.OpPermuteInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint16x16.Permute", opLen2_21(ssa.OpPermuteUint16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x32.Permute", opLen2_21(ssa.OpPermuteInt16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint16x32.Permute", opLen2_21(ssa.OpPermuteUint16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float32x8.Permute", opLen2_21(ssa.OpPermuteFloat32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int32x8.Permute", opLen2_21(ssa.OpPermuteInt32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint32x8.Permute", opLen2_21(ssa.OpPermuteUint32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float32x16.Permute", opLen2_21(ssa.OpPermuteFloat32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int32x16.Permute", opLen2_21(ssa.OpPermuteInt32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint32x16.Permute", opLen2_21(ssa.OpPermuteUint32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float64x4.Permute", opLen2_21(ssa.OpPermuteFloat64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int64x4.Permute", opLen2_21(ssa.OpPermuteInt64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint64x4.Permute", opLen2_21(ssa.OpPermuteUint64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float64x8.Permute", opLen2_21(ssa.OpPermuteFloat64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int64x8.Permute", opLen2_21(ssa.OpPermuteInt64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint64x8.Permute", opLen2_21(ssa.OpPermuteUint64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int8x16.Permute2", opLen3_231(ssa.OpPermute2Int8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint8x16.Permute2", opLen3_231(ssa.OpPermute2Uint8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int8x32.Permute2", opLen3_231(ssa.OpPermute2Int8x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint8x32.Permute2", opLen3_231(ssa.OpPermute2Uint8x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int8x64.Permute2", opLen3_231(ssa.OpPermute2Int8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint8x64.Permute2", opLen3_231(ssa.OpPermute2Uint8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int16x8.Permute2", opLen3_231(ssa.OpPermute2Int16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint16x8.Permute2", opLen3_231(ssa.OpPermute2Uint16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x16.Permute2", opLen3_231(ssa.OpPermute2Int16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint16x16.Permute2", opLen3_231(ssa.OpPermute2Uint16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x32.Permute2", opLen3_231(ssa.OpPermute2Int16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint16x32.Permute2", opLen3_231(ssa.OpPermute2Uint16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float32x4.Permute2", opLen3_231(ssa.OpPermute2Float32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int32x4.Permute2", opLen3_231(ssa.OpPermute2Int32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint32x4.Permute2", opLen3_231(ssa.OpPermute2Uint32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Float32x8.Permute2", opLen3_231(ssa.OpPermute2Float32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int32x8.Permute2", opLen3_231(ssa.OpPermute2Int32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint32x8.Permute2", opLen3_231(ssa.OpPermute2Uint32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float32x16.Permute2", opLen3_231(ssa.OpPermute2Float32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int32x16.Permute2", opLen3_231(ssa.OpPermute2Int32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint32x16.Permute2", opLen3_231(ssa.OpPermute2Uint32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float64x2.Permute2", opLen3_231(ssa.OpPermute2Float64x2, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int64x2.Permute2", opLen3_231(ssa.OpPermute2Int64x2, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint64x2.Permute2", opLen3_231(ssa.OpPermute2Uint64x2, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Float64x4.Permute2", opLen3_231(ssa.OpPermute2Float64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int64x4.Permute2", opLen3_231(ssa.OpPermute2Int64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint64x4.Permute2", opLen3_231(ssa.OpPermute2Uint64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float64x8.Permute2", opLen3_231(ssa.OpPermute2Float64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int64x8.Permute2", opLen3_231(ssa.OpPermute2Int64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint64x8.Permute2", opLen3_231(ssa.OpPermute2Uint64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int8x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint8x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int8x32.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt8x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint8x32.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint8x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int8x64.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint8x64.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int16x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint16x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint16x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x32.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint16x32.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float32x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int32x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint32x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Float32x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int32x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint32x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float32x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int32x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint32x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float64x2.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat64x2, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int64x2.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt64x2, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint64x2.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint64x2, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Float64x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int64x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint64x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float64x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int64x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint64x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int8x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint8x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int8x32.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt8x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint8x32.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint8x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int8x64.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint8x64.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int16x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint16x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint16x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x32.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint16x32.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float32x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedFloat32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int32x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint32x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float32x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedFloat32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int32x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint32x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float64x4.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedFloat64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int64x4.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint64x4.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float64x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedFloat64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int64x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint64x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int8x16.PopCount", opLen1(ssa.OpPopCountInt8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x32.PopCount", opLen1(ssa.OpPopCountInt8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x64.PopCount", opLen1(ssa.OpPopCountInt8x64, types.TypeVec512), sys.AMD64)
diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go
index 7a8780e5cb..29899f8cb1 100644
--- a/src/simd/ops_amd64.go
+++ b/src/simd/ops_amd64.go
@@ -5391,6 +5391,830 @@ func (x Uint32x4) PairwiseSub(y Uint32x4) Uint32x4
 // Asm: VPHSUBD, CPU Feature: AVX2
 func (x Uint32x8) PairwiseSub(y Uint32x8) Uint32x8
 
+/* Permute */
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x16) Permute(indices Uint8x16) Int8x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Uint8x16) Permute(indices Uint8x16) Uint8x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x32) Permute(indices Uint8x32) Int8x32
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Uint8x32) Permute(indices Uint8x32) Uint8x32
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x64) Permute(indices Uint8x64) Int8x64
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Uint8x64) Permute(indices Uint8x64) Uint8x64
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Int16x8) Permute(indices Uint16x8) Int16x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Uint16x8) Permute(indices Uint16x8) Uint16x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Int16x16) Permute(indices Uint16x16) Int16x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Uint16x16) Permute(indices Uint16x16) Uint16x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Int16x32) Permute(indices Uint16x32) Int16x32
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Uint16x32) Permute(indices Uint16x32) Uint16x32
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMPS, CPU Feature: AVX2
+func (x Float32x8) Permute(indices Uint32x8) Float32x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMD, CPU Feature: AVX2
+func (x Int32x8) Permute(indices Uint32x8) Int32x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMD, CPU Feature: AVX2
+func (x Uint32x8) Permute(indices Uint32x8) Uint32x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMPS, CPU Feature: AVX512F
+func (x Float32x16) Permute(indices Uint32x16) Float32x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMD, CPU Feature: AVX512F
+func (x Int32x16) Permute(indices Uint32x16) Int32x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMD, CPU Feature: AVX512F
+func (x Uint32x16) Permute(indices Uint32x16) Uint32x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMPD, CPU Feature: AVX512F
+func (x Float64x4) Permute(indices Uint64x4) Float64x4
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMQ, CPU Feature: AVX512F
+func (x Int64x4) Permute(indices Uint64x4) Int64x4
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMQ, CPU Feature: AVX512F
+func (x Uint64x4) Permute(indices Uint64x4) Uint64x4
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMPD, CPU Feature: AVX512F
+func (x Float64x8) Permute(indices Uint64x8) Float64x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMQ, CPU Feature: AVX512F
+func (x Int64x8) Permute(indices Uint64x8) Int64x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMQ, CPU Feature: AVX512F
+func (x Uint64x8) Permute(indices Uint64x8) Uint64x8
+
+/* Permute2 */
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x16) Permute2(y Int8x16, indices Uint8x16) Int8x16
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x16) Permute2(y Uint8x16, indices Uint8x16) Uint8x16
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x32) Permute2(y Int8x32, indices Uint8x32) Int8x32
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x32) Permute2(y Uint8x32, indices Uint8x32) Uint8x32
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x64) Permute2(y Int8x64, indices Uint8x64) Int8x64
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x64) Permute2(y Uint8x64, indices Uint8x64) Uint8x64
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Int16x8) Permute2(y Int16x8, indices Uint16x8) Int16x8
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Uint16x8) Permute2(y Uint16x8, indices Uint16x8) Uint16x8
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Int16x16) Permute2(y Int16x16, indices Uint16x16) Int16x16
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Uint16x16) Permute2(y Uint16x16, indices Uint16x16) Uint16x16
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Int16x32) Permute2(y Int16x32, indices Uint16x32) Int16x32
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Uint16x32) Permute2(y Uint16x32, indices Uint16x32) Uint16x32
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512F
+func (x Float32x4) Permute2(y Float32x4, indices Uint32x4) Float32x4
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Int32x4) Permute2(y Int32x4, indices Uint32x4) Int32x4
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Uint32x4) Permute2(y Uint32x4, indices Uint32x4) Uint32x4
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512F
+func (x Float32x8) Permute2(y Float32x8, indices Uint32x8) Float32x8
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Int32x8) Permute2(y Int32x8, indices Uint32x8) Int32x8
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Uint32x8) Permute2(y Uint32x8, indices Uint32x8) Uint32x8
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512F
+func (x Float32x16) Permute2(y Float32x16, indices Uint32x16) Float32x16
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Int32x16) Permute2(y Int32x16, indices Uint32x16) Int32x16
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Uint32x16) Permute2(y Uint32x16, indices Uint32x16) Uint32x16
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512F
+func (x Float64x2) Permute2(y Float64x2, indices Uint64x2) Float64x2
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Int64x2) Permute2(y Int64x2, indices Uint64x2) Int64x2
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Uint64x2) Permute2(y Uint64x2, indices Uint64x2) Uint64x2
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512F
+func (x Float64x4) Permute2(y Float64x4, indices Uint64x4) Float64x4
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Int64x4) Permute2(y Int64x4, indices Uint64x4) Int64x4
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Uint64x4) Permute2(y Uint64x4, indices Uint64x4) Uint64x4
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512F
+func (x Float64x8) Permute2(y Float64x8, indices Uint64x8) Float64x8
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Int64x8) Permute2(y Int64x8, indices Uint64x8) Int64x8
+
+// Permute2 performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended.
+// Only the bits needed to represent an index into xy are used from each element of indices.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Uint64x8) Permute2(y Uint64x8, indices Uint64x8) Uint64x8
+
+/* Permute2Masked */
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x16) Permute2Masked(y Int8x16, indices Uint8x16, u Mask8x16) Int8x16
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x16) Permute2Masked(y Uint8x16, indices Uint8x16, u Mask8x16) Uint8x16
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x32) Permute2Masked(y Int8x32, indices Uint8x32, u Mask8x32) Int8x32
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x32) Permute2Masked(y Uint8x32, indices Uint8x32, u Mask8x32) Uint8x32
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x64) Permute2Masked(y Int8x64, indices Uint8x64, u Mask8x64) Int8x64
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x64) Permute2Masked(y Uint8x64, indices Uint8x64, u Mask8x64) Uint8x64
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Int16x8) Permute2Masked(y Int16x8, indices Uint16x8, u Mask16x8) Int16x8
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Uint16x8) Permute2Masked(y Uint16x8, indices Uint16x8, u Mask16x8) Uint16x8
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Int16x16) Permute2Masked(y Int16x16, indices Uint16x16, u Mask16x16) Int16x16
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Uint16x16) Permute2Masked(y Uint16x16, indices Uint16x16, u Mask16x16) Uint16x16
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Int16x32) Permute2Masked(y Int16x32, indices Uint16x32, u Mask16x32) Int16x32
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Uint16x32) Permute2Masked(y Uint16x32, indices Uint16x32, u Mask16x32) Uint16x32
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512F
+func (x Float32x4) Permute2Masked(y Float32x4, indices Uint32x4, u Mask32x4) Float32x4
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Int32x4) Permute2Masked(y Int32x4, indices Uint32x4, u Mask32x4) Int32x4
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Uint32x4) Permute2Masked(y Uint32x4, indices Uint32x4, u Mask32x4) Uint32x4
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512F
+func (x Float32x8) Permute2Masked(y Float32x8, indices Uint32x8, u Mask32x8) Float32x8
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Int32x8) Permute2Masked(y Int32x8, indices Uint32x8, u Mask32x8) Int32x8
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Uint32x8) Permute2Masked(y Uint32x8, indices Uint32x8, u Mask32x8) Uint32x8
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512F
+func (x Float32x16) Permute2Masked(y Float32x16, indices Uint32x16, u Mask32x16) Float32x16
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Int32x16) Permute2Masked(y Int32x16, indices Uint32x16, u Mask32x16) Int32x16
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Uint32x16) Permute2Masked(y Uint32x16, indices Uint32x16, u Mask32x16) Uint32x16
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512F
+func (x Float64x2) Permute2Masked(y Float64x2, indices Uint64x2, u Mask64x2) Float64x2
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Int64x2) Permute2Masked(y Int64x2, indices Uint64x2, u Mask64x2) Int64x2
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Uint64x2) Permute2Masked(y Uint64x2, indices Uint64x2, u Mask64x2) Uint64x2
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512F
+func (x Float64x4) Permute2Masked(y Float64x4, indices Uint64x4, u Mask64x4) Float64x4
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Int64x4) Permute2Masked(y Int64x4, indices Uint64x4, u Mask64x4) Int64x4
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Uint64x4) Permute2Masked(y Uint64x4, indices Uint64x4, u Mask64x4) Uint64x4
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512F
+func (x Float64x8) Permute2Masked(y Float64x8, indices Uint64x8, u Mask64x8) Float64x8
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Int64x8) Permute2Masked(y Int64x8, indices Uint64x8, u Mask64x8) Int64x8
+
+// Permute2Masked performs a full permutation of the vectors x and y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x with y appended. Only the bits needed to represent an index into xy are used from each element of indices.
+// NOTE(review): the effect of mask u is not described here; presumably elements not selected by u are zeroed - confirm.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Uint64x8) Permute2Masked(y Uint64x8, indices Uint64x8, u Mask64x8) Uint64x8
+
+/* PermuteMasked */
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x16) PermuteMasked(indices Uint8x16, z Mask8x16) Int8x16
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Uint8x16) PermuteMasked(indices Uint8x16, z Mask8x16) Uint8x16
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x32) PermuteMasked(indices Uint8x32, z Mask8x32) Int8x32
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Uint8x32) PermuteMasked(indices Uint8x32, z Mask8x32) Uint8x32
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x64) PermuteMasked(indices Uint8x64, z Mask8x64) Int8x64
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Uint8x64) PermuteMasked(indices Uint8x64, z Mask8x64) Uint8x64
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Int16x8) PermuteMasked(indices Uint16x8, z Mask16x8) Int16x8
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Uint16x8) PermuteMasked(indices Uint16x8, z Mask16x8) Uint16x8
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Int16x16) PermuteMasked(indices Uint16x16, z Mask16x16) Int16x16
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Uint16x16) PermuteMasked(indices Uint16x16, z Mask16x16) Uint16x16
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Int16x32) PermuteMasked(indices Uint16x32, z Mask16x32) Int16x32
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Uint16x32) PermuteMasked(indices Uint16x32, z Mask16x32) Uint16x32
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMPS, CPU Feature: AVX512F
+func (x Float32x8) PermuteMasked(indices Uint32x8, z Mask32x8) Float32x8
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMD, CPU Feature: AVX512F
+func (x Int32x8) PermuteMasked(indices Uint32x8, z Mask32x8) Int32x8
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMD, CPU Feature: AVX512F
+func (x Uint32x8) PermuteMasked(indices Uint32x8, z Mask32x8) Uint32x8
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMPS, CPU Feature: AVX512F
+func (x Float32x16) PermuteMasked(indices Uint32x16, z Mask32x16) Float32x16
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMD, CPU Feature: AVX512F
+func (x Int32x16) PermuteMasked(indices Uint32x16, z Mask32x16) Int32x16
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMD, CPU Feature: AVX512F
+func (x Uint32x16) PermuteMasked(indices Uint32x16, z Mask32x16) Uint32x16
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMPD, CPU Feature: AVX512F
+func (x Float64x4) PermuteMasked(indices Uint64x4, z Mask64x4) Float64x4
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMQ, CPU Feature: AVX512F
+func (x Int64x4) PermuteMasked(indices Uint64x4, z Mask64x4) Int64x4
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMQ, CPU Feature: AVX512F
+func (x Uint64x4) PermuteMasked(indices Uint64x4, z Mask64x4) Uint64x4
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMPD, CPU Feature: AVX512F
+func (x Float64x8) PermuteMasked(indices Uint64x8, z Mask64x8) Float64x8
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMQ, CPU Feature: AVX512F
+func (x Int64x8) PermuteMasked(indices Uint64x8, z Mask64x8) Int64x8
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the bits needed to index x are used from each element of indices. NOTE(review): mask z's effect is not described here; confirm.
+//
+// Asm: VPERMQ, CPU Feature: AVX512F
+func (x Uint64x8) PermuteMasked(indices Uint64x8, z Mask64x8) Uint64x8
+
 /* PopCount */
 
 // PopCount counts the number of set bits in each element.
diff --git a/src/simd/simd_test.go b/src/simd/simd_test.go
index 36923319ff..f1a2f11738 100644
--- a/src/simd/simd_test.go
+++ b/src/simd/simd_test.go
@@ -151,6 +151,41 @@ func TestMaskedAdd(t *testing.T) {
 	testInt32x4BinaryMasked(t, []int32{1, 2, 3, 4}, []int32{5, 6, 7, 8}, []int32{-1, -1, 0, 0}, []int32{6, 8, 0, 0}, "AddMasked")
 }
 
+func TestPermute(t *testing.T) {
+	if !simd.HasAVX512() {
+		t.Skip("Test requires HasAVX512, not available on this hardware")
+		return
+	}
+	want := []int64{8, 7, 6, 5, 4, 3, 2, 1}
+	got := make([]int64, 8)
+	src := simd.LoadInt64x8Slice([]int64{1, 2, 3, 4, 5, 6, 7, 8})
+	idx := simd.LoadUint64x8Slice([]uint64{7, 6, 5, 4, 3, 2, 1, 0})
+	src.Permute(idx).StoreSlice(got) // descending indices are expected to reverse the element order
+	for i, w := range want {
+		if g := got[i]; g != w {
+			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, w, g)
+		}
+	}
+}
+
+func TestPermute2(t *testing.T) {
+	if !simd.HasAVX512() {
+		t.Skip("Test requires HasAVX512, not available on this hardware")
+		return
+	}
+	lo := simd.LoadInt64x8Slice([]int64{1, 2, 3, 4, 5, 6, 7, 8})
+	hi := simd.LoadInt64x8Slice([]int64{-1, -2, -3, -4, -5, -6, -7, -8})
+	idx := simd.LoadUint64x8Slice([]uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0})
+	want := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
+	got := make([]int64, 8)
+	lo.Permute2(hi, idx).StoreSlice(got) // indices 0-7 are expected to select from lo, 8-15 from hi
+	for i, w := range want {
+		if g := got[i]; g != w {
+			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, w, g)
+		}
+	}
+}
+
 // checkInt8Slices ensures that b and a are equal, to the end of b.
 // also serves to use the slices, to prevent accidental optimization.
 func checkInt8Slices(t *testing.T, a, b []int8) {
diff --git a/src/simd/simd_wrapped_test.go b/src/simd/simd_wrapped_test.go
index 6466684068..29452bdad0 100644
--- a/src/simd/simd_wrapped_test.go
+++ b/src/simd/simd_wrapped_test.go
@@ -7800,6 +7800,10 @@ func testUint64x8UnaryMasked(t *testing.T, v0 []uint64, v1 []int64, want []uint6
 // GaloisFieldAffineTransformMasked
 // Get128
 // GetElem
+// Permute
+// Permute2
+// Permute2Masked
+// PermuteMasked
 // RotateAllLeft
 // RotateAllLeftMasked
 // RotateAllRight