[dev.simd] cmd/compile, simd: add variable Permute

author Junyang Shao <shaojunyang@google.com>
Mon, 14 Jul 2025 19:39:44 +0000 (19:39 +0000)
committer Junyang Shao <shaojunyang@google.com>
Tue, 15 Jul 2025 21:53:57 +0000 (14:53 -0700)
diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go

index 0ebb955accda3f98f8e2e79e0cabeb514f86f9fb..1a7e3be9e50d279fcebefb6928a038ac7978a4e4 100644 (file)
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -233,6 +233,20 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPHSUBW256,
                 ssa.OpAMD64VPHSUBD128,
                 ssa.OpAMD64VPHSUBD256,
+               ssa.OpAMD64VPERMB128,
+               ssa.OpAMD64VPERMB256,
+               ssa.OpAMD64VPERMB512,
+               ssa.OpAMD64VPERMW128,
+               ssa.OpAMD64VPERMW256,
+               ssa.OpAMD64VPERMW512,
+               ssa.OpAMD64VPERMPS256,
+               ssa.OpAMD64VPERMD256,
+               ssa.OpAMD64VPERMPS512,
+               ssa.OpAMD64VPERMD512,
+               ssa.OpAMD64VPERMPD256,
+               ssa.OpAMD64VPERMQ256,
+               ssa.OpAMD64VPERMPD512,
+               ssa.OpAMD64VPERMQ512,
                 ssa.OpAMD64VPROLVD128,
                 ssa.OpAMD64VPROLVD256,
                 ssa.OpAMD64VPROLVD512,
@@ -468,6 +482,20 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPMADDWDMasked128,
                 ssa.OpAMD64VPMADDWDMasked256,
                 ssa.OpAMD64VPMADDWDMasked512,
+               ssa.OpAMD64VPERMBMasked128,
+               ssa.OpAMD64VPERMBMasked256,
+               ssa.OpAMD64VPERMBMasked512,
+               ssa.OpAMD64VPERMWMasked128,
+               ssa.OpAMD64VPERMWMasked256,
+               ssa.OpAMD64VPERMWMasked512,
+               ssa.OpAMD64VPERMPSMasked256,
+               ssa.OpAMD64VPERMDMasked256,
+               ssa.OpAMD64VPERMPSMasked512,
+               ssa.OpAMD64VPERMDMasked512,
+               ssa.OpAMD64VPERMPDMasked256,
+               ssa.OpAMD64VPERMQMasked256,
+               ssa.OpAMD64VPERMPDMasked512,
+               ssa.OpAMD64VPERMQMasked512,
                 ssa.OpAMD64VPROLVDMasked128,
                 ssa.OpAMD64VPROLVDMasked256,
                 ssa.OpAMD64VPROLVDMasked512,
@@ -766,6 +794,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPDPWSSD128,
                 ssa.OpAMD64VPDPWSSD256,
                 ssa.OpAMD64VPDPWSSD512,
+               ssa.OpAMD64VPERMI2B128,
+               ssa.OpAMD64VPERMI2B256,
+               ssa.OpAMD64VPERMI2B512,
+               ssa.OpAMD64VPERMI2W128,
+               ssa.OpAMD64VPERMI2W256,
+               ssa.OpAMD64VPERMI2W512,
+               ssa.OpAMD64VPERMI2PS128,
+               ssa.OpAMD64VPERMI2D128,
+               ssa.OpAMD64VPERMI2PS256,
+               ssa.OpAMD64VPERMI2D256,
+               ssa.OpAMD64VPERMI2PS512,
+               ssa.OpAMD64VPERMI2D512,
+               ssa.OpAMD64VPERMI2PD128,
+               ssa.OpAMD64VPERMI2Q128,
+               ssa.OpAMD64VPERMI2PD256,
+               ssa.OpAMD64VPERMI2Q256,
+               ssa.OpAMD64VPERMI2PD512,
+               ssa.OpAMD64VPERMI2Q512,
                 ssa.OpAMD64VPDPWSSDS128,
                 ssa.OpAMD64VPDPWSSDS256,
                 ssa.OpAMD64VPDPWSSDS512,
@@ -816,6 +862,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPDPWSSDMasked128,
                 ssa.OpAMD64VPDPWSSDMasked256,
                 ssa.OpAMD64VPDPWSSDMasked512,
+               ssa.OpAMD64VPERMI2BMasked128,
+               ssa.OpAMD64VPERMI2BMasked256,
+               ssa.OpAMD64VPERMI2BMasked512,
+               ssa.OpAMD64VPERMI2WMasked128,
+               ssa.OpAMD64VPERMI2WMasked256,
+               ssa.OpAMD64VPERMI2WMasked512,
+               ssa.OpAMD64VPERMI2PSMasked128,
+               ssa.OpAMD64VPERMI2DMasked128,
+               ssa.OpAMD64VPERMI2PSMasked256,
+               ssa.OpAMD64VPERMI2DMasked256,
+               ssa.OpAMD64VPERMI2PSMasked512,
+               ssa.OpAMD64VPERMI2DMasked512,
+               ssa.OpAMD64VPERMI2PDMasked128,
+               ssa.OpAMD64VPERMI2QMasked128,
+               ssa.OpAMD64VPERMI2PDMasked256,
+               ssa.OpAMD64VPERMI2QMasked256,
+               ssa.OpAMD64VPERMI2PDMasked512,
+               ssa.OpAMD64VPERMI2QMasked512,
                 ssa.OpAMD64VPDPWSSDSMasked128,
                 ssa.OpAMD64VPDPWSSDSMasked256,
                 ssa.OpAMD64VPDPWSSDSMasked512,
@@ -1158,6 +1222,38 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPMADDWDMasked128,
                 ssa.OpAMD64VPMADDWDMasked256,
                 ssa.OpAMD64VPMADDWDMasked512,
+               ssa.OpAMD64VPERMI2BMasked128,
+               ssa.OpAMD64VPERMI2BMasked256,
+               ssa.OpAMD64VPERMI2BMasked512,
+               ssa.OpAMD64VPERMI2WMasked128,
+               ssa.OpAMD64VPERMI2WMasked256,
+               ssa.OpAMD64VPERMI2WMasked512,
+               ssa.OpAMD64VPERMI2PSMasked128,
+               ssa.OpAMD64VPERMI2DMasked128,
+               ssa.OpAMD64VPERMI2PSMasked256,
+               ssa.OpAMD64VPERMI2DMasked256,
+               ssa.OpAMD64VPERMI2PSMasked512,
+               ssa.OpAMD64VPERMI2DMasked512,
+               ssa.OpAMD64VPERMI2PDMasked128,
+               ssa.OpAMD64VPERMI2QMasked128,
+               ssa.OpAMD64VPERMI2PDMasked256,
+               ssa.OpAMD64VPERMI2QMasked256,
+               ssa.OpAMD64VPERMI2PDMasked512,
+               ssa.OpAMD64VPERMI2QMasked512,
+               ssa.OpAMD64VPERMBMasked128,
+               ssa.OpAMD64VPERMBMasked256,
+               ssa.OpAMD64VPERMBMasked512,
+               ssa.OpAMD64VPERMWMasked128,
+               ssa.OpAMD64VPERMWMasked256,
+               ssa.OpAMD64VPERMWMasked512,
+               ssa.OpAMD64VPERMPSMasked256,
+               ssa.OpAMD64VPERMDMasked256,
+               ssa.OpAMD64VPERMPSMasked512,
+               ssa.OpAMD64VPERMDMasked512,
+               ssa.OpAMD64VPERMPDMasked256,
+               ssa.OpAMD64VPERMQMasked256,
+               ssa.OpAMD64VPERMPDMasked512,
+               ssa.OpAMD64VPERMQMasked512,
                 ssa.OpAMD64VPOPCNTBMasked128,
                 ssa.OpAMD64VPOPCNTBMasked256,
                 ssa.OpAMD64VPOPCNTBMasked512,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules

index 0cbca8bf72491b5c77910477504d707e6ed59bc1..5898406e9d351cc752ca82367abe379d598e2c22 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -985,6 +985,114 @@
  (PairwiseSubUint16x16 ...) => (VPHSUBW256 ...)
  (PairwiseSubUint32x4 ...) => (VPHSUBD128 ...)
  (PairwiseSubUint32x8 ...) => (VPHSUBD256 ...)
+(PermuteFloat32x8 ...) => (VPERMPS256 ...)
+(PermuteFloat32x16 ...) => (VPERMPS512 ...)
+(PermuteFloat64x4 ...) => (VPERMPD256 ...)
+(PermuteFloat64x8 ...) => (VPERMPD512 ...)
+(PermuteInt8x16 ...) => (VPERMB128 ...)
+(PermuteInt8x32 ...) => (VPERMB256 ...)
+(PermuteInt8x64 ...) => (VPERMB512 ...)
+(PermuteInt16x8 ...) => (VPERMW128 ...)
+(PermuteInt16x16 ...) => (VPERMW256 ...)
+(PermuteInt16x32 ...) => (VPERMW512 ...)
+(PermuteInt32x8 ...) => (VPERMD256 ...)
+(PermuteInt32x16 ...) => (VPERMD512 ...)
+(PermuteInt64x4 ...) => (VPERMQ256 ...)
+(PermuteInt64x8 ...) => (VPERMQ512 ...)
+(PermuteUint8x16 ...) => (VPERMB128 ...)
+(PermuteUint8x32 ...) => (VPERMB256 ...)
+(PermuteUint8x64 ...) => (VPERMB512 ...)
+(PermuteUint16x8 ...) => (VPERMW128 ...)
+(PermuteUint16x16 ...) => (VPERMW256 ...)
+(PermuteUint16x32 ...) => (VPERMW512 ...)
+(PermuteUint32x8 ...) => (VPERMD256 ...)
+(PermuteUint32x16 ...) => (VPERMD512 ...)
+(PermuteUint64x4 ...) => (VPERMQ256 ...)
+(PermuteUint64x8 ...) => (VPERMQ512 ...)
+(Permute2Float32x4 ...) => (VPERMI2PS128 ...)
+(Permute2Float32x8 ...) => (VPERMI2PS256 ...)
+(Permute2Float32x16 ...) => (VPERMI2PS512 ...)
+(Permute2Float64x2 ...) => (VPERMI2PD128 ...)
+(Permute2Float64x4 ...) => (VPERMI2PD256 ...)
+(Permute2Float64x8 ...) => (VPERMI2PD512 ...)
+(Permute2Int8x16 ...) => (VPERMI2B128 ...)
+(Permute2Int8x32 ...) => (VPERMI2B256 ...)
+(Permute2Int8x64 ...) => (VPERMI2B512 ...)
+(Permute2Int16x8 ...) => (VPERMI2W128 ...)
+(Permute2Int16x16 ...) => (VPERMI2W256 ...)
+(Permute2Int16x32 ...) => (VPERMI2W512 ...)
+(Permute2Int32x4 ...) => (VPERMI2D128 ...)
+(Permute2Int32x8 ...) => (VPERMI2D256 ...)
+(Permute2Int32x16 ...) => (VPERMI2D512 ...)
+(Permute2Int64x2 ...) => (VPERMI2Q128 ...)
+(Permute2Int64x4 ...) => (VPERMI2Q256 ...)
+(Permute2Int64x8 ...) => (VPERMI2Q512 ...)
+(Permute2Uint8x16 ...) => (VPERMI2B128 ...)
+(Permute2Uint8x32 ...) => (VPERMI2B256 ...)
+(Permute2Uint8x64 ...) => (VPERMI2B512 ...)
+(Permute2Uint16x8 ...) => (VPERMI2W128 ...)
+(Permute2Uint16x16 ...) => (VPERMI2W256 ...)
+(Permute2Uint16x32 ...) => (VPERMI2W512 ...)
+(Permute2Uint32x4 ...) => (VPERMI2D128 ...)
+(Permute2Uint32x8 ...) => (VPERMI2D256 ...)
+(Permute2Uint32x16 ...) => (VPERMI2D512 ...)
+(Permute2Uint64x2 ...) => (VPERMI2Q128 ...)
+(Permute2Uint64x4 ...) => (VPERMI2Q256 ...)
+(Permute2Uint64x8 ...) => (VPERMI2Q512 ...)
+(Permute2MaskedFloat32x4 x y z mask) => (VPERMI2PSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+(Permute2MaskedFloat32x8 x y z mask) => (VPERMI2PSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+(Permute2MaskedFloat32x16 x y z mask) => (VPERMI2PSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+(Permute2MaskedFloat64x2 x y z mask) => (VPERMI2PDMasked128 x y z (VPMOVVec64x2ToM <types.TypeMask> mask))
+(Permute2MaskedFloat64x4 x y z mask) => (VPERMI2PDMasked256 x y z (VPMOVVec64x4ToM <types.TypeMask> mask))
+(Permute2MaskedFloat64x8 x y z mask) => (VPERMI2PDMasked512 x y z (VPMOVVec64x8ToM <types.TypeMask> mask))
+(Permute2MaskedInt8x16 x y z mask) => (VPERMI2BMasked128 x y z (VPMOVVec8x16ToM <types.TypeMask> mask))
+(Permute2MaskedInt8x32 x y z mask) => (VPERMI2BMasked256 x y z (VPMOVVec8x32ToM <types.TypeMask> mask))
+(Permute2MaskedInt8x64 x y z mask) => (VPERMI2BMasked512 x y z (VPMOVVec8x64ToM <types.TypeMask> mask))
+(Permute2MaskedInt16x8 x y z mask) => (VPERMI2WMasked128 x y z (VPMOVVec16x8ToM <types.TypeMask> mask))
+(Permute2MaskedInt16x16 x y z mask) => (VPERMI2WMasked256 x y z (VPMOVVec16x16ToM <types.TypeMask> mask))
+(Permute2MaskedInt16x32 x y z mask) => (VPERMI2WMasked512 x y z (VPMOVVec16x32ToM <types.TypeMask> mask))
+(Permute2MaskedInt32x4 x y z mask) => (VPERMI2DMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+(Permute2MaskedInt32x8 x y z mask) => (VPERMI2DMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+(Permute2MaskedInt32x16 x y z mask) => (VPERMI2DMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+(Permute2MaskedInt64x2 x y z mask) => (VPERMI2QMasked128 x y z (VPMOVVec64x2ToM <types.TypeMask> mask))
+(Permute2MaskedInt64x4 x y z mask) => (VPERMI2QMasked256 x y z (VPMOVVec64x4ToM <types.TypeMask> mask))
+(Permute2MaskedInt64x8 x y z mask) => (VPERMI2QMasked512 x y z (VPMOVVec64x8ToM <types.TypeMask> mask))
+(Permute2MaskedUint8x16 x y z mask) => (VPERMI2BMasked128 x y z (VPMOVVec8x16ToM <types.TypeMask> mask))
+(Permute2MaskedUint8x32 x y z mask) => (VPERMI2BMasked256 x y z (VPMOVVec8x32ToM <types.TypeMask> mask))
+(Permute2MaskedUint8x64 x y z mask) => (VPERMI2BMasked512 x y z (VPMOVVec8x64ToM <types.TypeMask> mask))
+(Permute2MaskedUint16x8 x y z mask) => (VPERMI2WMasked128 x y z (VPMOVVec16x8ToM <types.TypeMask> mask))
+(Permute2MaskedUint16x16 x y z mask) => (VPERMI2WMasked256 x y z (VPMOVVec16x16ToM <types.TypeMask> mask))
+(Permute2MaskedUint16x32 x y z mask) => (VPERMI2WMasked512 x y z (VPMOVVec16x32ToM <types.TypeMask> mask))
+(Permute2MaskedUint32x4 x y z mask) => (VPERMI2DMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+(Permute2MaskedUint32x8 x y z mask) => (VPERMI2DMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+(Permute2MaskedUint32x16 x y z mask) => (VPERMI2DMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+(Permute2MaskedUint64x2 x y z mask) => (VPERMI2QMasked128 x y z (VPMOVVec64x2ToM <types.TypeMask> mask))
+(Permute2MaskedUint64x4 x y z mask) => (VPERMI2QMasked256 x y z (VPMOVVec64x4ToM <types.TypeMask> mask))
+(Permute2MaskedUint64x8 x y z mask) => (VPERMI2QMasked512 x y z (VPMOVVec64x8ToM <types.TypeMask> mask))
+(PermuteMaskedFloat32x8 x y mask) => (VPERMPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+(PermuteMaskedFloat32x16 x y mask) => (VPERMPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+(PermuteMaskedFloat64x4 x y mask) => (VPERMPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+(PermuteMaskedFloat64x8 x y mask) => (VPERMPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(PermuteMaskedInt8x16 x y mask) => (VPERMBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+(PermuteMaskedInt8x32 x y mask) => (VPERMBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+(PermuteMaskedInt8x64 x y mask) => (VPERMBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+(PermuteMaskedInt16x8 x y mask) => (VPERMWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(PermuteMaskedInt16x16 x y mask) => (VPERMWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+(PermuteMaskedInt16x32 x y mask) => (VPERMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+(PermuteMaskedInt32x8 x y mask) => (VPERMDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+(PermuteMaskedInt32x16 x y mask) => (VPERMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+(PermuteMaskedInt64x4 x y mask) => (VPERMQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+(PermuteMaskedInt64x8 x y mask) => (VPERMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(PermuteMaskedUint8x16 x y mask) => (VPERMBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+(PermuteMaskedUint8x32 x y mask) => (VPERMBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+(PermuteMaskedUint8x64 x y mask) => (VPERMBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+(PermuteMaskedUint16x8 x y mask) => (VPERMWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(PermuteMaskedUint16x16 x y mask) => (VPERMWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+(PermuteMaskedUint16x32 x y mask) => (VPERMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+(PermuteMaskedUint32x8 x y mask) => (VPERMDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+(PermuteMaskedUint32x16 x y mask) => (VPERMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+(PermuteMaskedUint64x4 x y mask) => (VPERMQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+(PermuteMaskedUint64x8 x y mask) => (VPERMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
  (PopCountInt8x16 ...) => (VPOPCNTB128 ...)
  (PopCountInt8x32 ...) => (VPOPCNTB256 ...)
  (PopCountInt8x64 ...) => (VPOPCNTB512 ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go

index 6985daa04bcaaa36740d3e6b066c9f06cec8f778..19ac0b0dea658359692cacde09db2c0e99a9c14d 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@@ -613,6 +613,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPMINUWMasked256", argLength: 3, reg: w2kw, asm: "VPMINUW", commutative: true, typ: "Vec256", resultInArg0: false},
                 {name: "VPMULHUW256", argLength: 2, reg: v21, asm: "VPMULHUW", commutative: true, typ: "Vec256", resultInArg0: false},
                 {name: "VPMULHUWMasked256", argLength: 3, reg: w2kw, asm: "VPMULHUW", commutative: true, typ: "Vec256", resultInArg0: false},
+               {name: "VPERMW256", argLength: 2, reg: w21, asm: "VPERMW", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPERMI2W256", argLength: 3, reg: w31, asm: "VPERMI2W", commutative: false, typ: "Vec256", resultInArg0: true},
+               {name: "VPERMI2WMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2W", commutative: false, typ: "Vec256", resultInArg0: true},
+               {name: "VPERMWMasked256", argLength: 3, reg: w2kw, asm: "VPERMW", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPSRLW256", argLength: 2, reg: vfpv, asm: "VPSRLW", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPSRLWMasked256", argLength: 3, reg: wfpkw, asm: "VPSRLW", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPSRLVW256", argLength: 2, reg: w21, asm: "VPSRLVW", commutative: false, typ: "Vec256", resultInArg0: false},
@@ -625,6 +629,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPMINUWMasked512", argLength: 3, reg: w2kw, asm: "VPMINUW", commutative: true, typ: "Vec512", resultInArg0: false},
                 {name: "VPMULHUW512", argLength: 2, reg: w21, asm: "VPMULHUW", commutative: true, typ: "Vec512", resultInArg0: false},
                 {name: "VPMULHUWMasked512", argLength: 3, reg: w2kw, asm: "VPMULHUW", commutative: true, typ: "Vec512", resultInArg0: false},
+               {name: "VPERMW512", argLength: 2, reg: w21, asm: "VPERMW", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPERMI2W512", argLength: 3, reg: w31, asm: "VPERMI2W", commutative: false, typ: "Vec512", resultInArg0: true},
+               {name: "VPERMI2WMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2W", commutative: false, typ: "Vec512", resultInArg0: true},
+               {name: "VPERMWMasked512", argLength: 3, reg: w2kw, asm: "VPERMW", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VPSRLW512", argLength: 2, reg: wfpw, asm: "VPSRLW", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VPSRLWMasked512", argLength: 3, reg: wfpkw, asm: "VPSRLW", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VPSRLVW512", argLength: 2, reg: w21, asm: "VPSRLVW", commutative: false, typ: "Vec512", resultInArg0: false},
@@ -637,6 +645,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPMINUWMasked128", argLength: 3, reg: w2kw, asm: "VPMINUW", commutative: true, typ: "Vec128", resultInArg0: false},
                 {name: "VPMULHUW128", argLength: 2, reg: v21, asm: "VPMULHUW", commutative: true, typ: "Vec128", resultInArg0: false},
                 {name: "VPMULHUWMasked128", argLength: 3, reg: w2kw, asm: "VPMULHUW", commutative: true, typ: "Vec128", resultInArg0: false},
+               {name: "VPERMW128", argLength: 2, reg: w21, asm: "VPERMW", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPERMI2W128", argLength: 3, reg: w31, asm: "VPERMI2W", commutative: false, typ: "Vec128", resultInArg0: true},
+               {name: "VPERMI2WMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2W", commutative: false, typ: "Vec128", resultInArg0: true},
+               {name: "VPERMWMasked128", argLength: 3, reg: w2kw, asm: "VPERMW", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPSRLW128", argLength: 2, reg: vfpv, asm: "VPSRLW", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPSRLWMasked128", argLength: 3, reg: wfpkw, asm: "VPSRLW", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPSRLVW128", argLength: 2, reg: w21, asm: "VPSRLVW", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -645,6 +657,14 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPMAXUDMasked512", argLength: 3, reg: w2kw, asm: "VPMAXUD", commutative: true, typ: "Vec512", resultInArg0: false},
                 {name: "VPMINUD512", argLength: 2, reg: w21, asm: "VPMINUD", commutative: true, typ: "Vec512", resultInArg0: false},
                 {name: "VPMINUDMasked512", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec512", resultInArg0: false},
+               {name: "VPERMPS512", argLength: 2, reg: w21, asm: "VPERMPS", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPERMD512", argLength: 2, reg: w21, asm: "VPERMD", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPERMI2D512", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec512", resultInArg0: true},
+               {name: "VPERMI2PS512", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec512", resultInArg0: true},
+               {name: "VPERMI2DMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec512", resultInArg0: true},
+               {name: "VPERMI2PSMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec512", resultInArg0: true},
+               {name: "VPERMPSMasked512", argLength: 3, reg: w2kw, asm: "VPERMPS", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPERMDMasked512", argLength: 3, reg: w2kw, asm: "VPERMD", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VPSRLD512", argLength: 2, reg: wfpw, asm: "VPSRLD", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VPSRLDMasked512", argLength: 3, reg: wfpkw, asm: "VPSRLD", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VPSRLVD512", argLength: 2, reg: w21, asm: "VPSRLVD", commutative: false, typ: "Vec512", resultInArg0: false},
@@ -654,6 +674,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPMINUD128", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false},
                 {name: "VPMINUDMasked128", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false},
                 {name: "VPMULUDQ128", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec128", resultInArg0: false},
+               {name: "VPERMI2D128", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true},
+               {name: "VPERMI2PS128", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true},
+               {name: "VPERMI2PSMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true},
+               {name: "VPERMI2DMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true},
                 {name: "VPSRLD128", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPSRLDMasked128", argLength: 3, reg: wfpkw, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPSRLVD128", argLength: 2, reg: v21, asm: "VPSRLVD", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -663,6 +687,14 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPMINUD256", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false},
                 {name: "VPMINUDMasked256", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false},
                 {name: "VPMULUDQ256", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false},
+               {name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPERMPS256", argLength: 2, reg: v21, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPERMI2D256", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true},
+               {name: "VPERMI2PS256", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true},
+               {name: "VPERMI2PSMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true},
+               {name: "VPERMI2DMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true},
+               {name: "VPERMPSMasked256", argLength: 3, reg: w2kw, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPERMDMasked256", argLength: 3, reg: w2kw, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPSRLD256", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPSRLDMasked256", argLength: 3, reg: wfpkw, asm: "VPSRLD", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPSRLVD256", argLength: 2, reg: v21, asm: "VPSRLVD", commutative: false, typ: "Vec256", resultInArg0: false},
@@ -672,6 +704,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPMINUQ128", argLength: 2, reg: w21, asm: "VPMINUQ", commutative: true, typ: "Vec128", resultInArg0: false},
                 {name: "VPMINUQMasked128", argLength: 3, reg: w2kw, asm: "VPMINUQ", commutative: true, typ: "Vec128", resultInArg0: false},
                 {name: "VPMULUDQMasked128", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec128", resultInArg0: false},
+               {name: "VPERMI2PD128", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec128", resultInArg0: true},
+               {name: "VPERMI2Q128", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec128", resultInArg0: true},
+               {name: "VPERMI2QMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec128", resultInArg0: true},
+               {name: "VPERMI2PDMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec128", resultInArg0: true},
                 {name: "VPSRLQ128", argLength: 2, reg: vfpv, asm: "VPSRLQ", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPSRLQMasked128", argLength: 3, reg: wfpkw, asm: "VPSRLQ", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPSRLVQ128", argLength: 2, reg: v21, asm: "VPSRLVQ", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -681,6 +717,14 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPMINUQ256", argLength: 2, reg: w21, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false},
                 {name: "VPMINUQMasked256", argLength: 3, reg: w2kw, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false},
                 {name: "VPMULUDQMasked256", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false},
+               {name: "VPERMQ256", argLength: 2, reg: w21, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPERMPD256", argLength: 2, reg: w21, asm: "VPERMPD", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPERMI2PD256", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true},
+               {name: "VPERMI2Q256", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true},
+               {name: "VPERMI2PDMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true},
+               {name: "VPERMI2QMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true},
+               {name: "VPERMPDMasked256", argLength: 3, reg: w2kw, asm: "VPERMPD", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPERMQMasked256", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPSRLQ256", argLength: 2, reg: vfpv, asm: "VPSRLQ", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPSRLQMasked256", argLength: 3, reg: wfpkw, asm: "VPSRLQ", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPSRLVQ256", argLength: 2, reg: v21, asm: "VPSRLVQ", commutative: false, typ: "Vec256", resultInArg0: false},
@@ -691,6 +735,14 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPMINUQMasked512", argLength: 3, reg: w2kw, asm: "VPMINUQ", commutative: true, typ: "Vec512", resultInArg0: false},
                 {name: "VPMULUDQ512", argLength: 2, reg: w21, asm: "VPMULUDQ", commutative: true, typ: "Vec512", resultInArg0: false},
                 {name: "VPMULUDQMasked512", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec512", resultInArg0: false},
+               {name: "VPERMPD512", argLength: 2, reg: w21, asm: "VPERMPD", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPERMQ512", argLength: 2, reg: w21, asm: "VPERMQ", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPERMI2Q512", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec512", resultInArg0: true},
+               {name: "VPERMI2PD512", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true},
+               {name: "VPERMI2QMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec512", resultInArg0: true},
+               {name: "VPERMI2PDMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true},
+               {name: "VPERMPDMasked512", argLength: 3, reg: w2kw, asm: "VPERMPD", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPERMQMasked512", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VPSRLQ512", argLength: 2, reg: wfpw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VPSRLQMasked512", argLength: 3, reg: wfpkw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VPSRLVQ512", argLength: 2, reg: w21, asm: "VPSRLVQ", commutative: false, typ: "Vec512", resultInArg0: false},
@@ -703,6 +755,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPMAXUBMasked128", argLength: 3, reg: w2kw, asm: "VPMAXUB", commutative: true, typ: "Vec128", resultInArg0: false},
                 {name: "VPMINUB128", argLength: 2, reg: v21, asm: "VPMINUB", commutative: true, typ: "Vec128", resultInArg0: false},
                 {name: "VPMINUBMasked128", argLength: 3, reg: w2kw, asm: "VPMINUB", commutative: true, typ: "Vec128", resultInArg0: false},
+               {name: "VPERMB128", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPERMI2B128", argLength: 3, reg: w31, asm: "VPERMI2B", commutative: false, typ: "Vec128", resultInArg0: true},
+               {name: "VPERMI2BMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2B", commutative: false, typ: "Vec128", resultInArg0: true},
+               {name: "VPERMBMasked128", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPMADDUBSW128", argLength: 2, reg: v21, asm: "VPMADDUBSW", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPMADDUBSWMasked128", argLength: 3, reg: w2kw, asm: "VPMADDUBSW", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPAVGB256", argLength: 2, reg: v21, asm: "VPAVGB", commutative: true, typ: "Vec256", resultInArg0: false},
@@ -713,6 +769,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPMAXUBMasked256", argLength: 3, reg: w2kw, asm: "VPMAXUB", commutative: true, typ: "Vec256", resultInArg0: false},
                 {name: "VPMINUB256", argLength: 2, reg: v21, asm: "VPMINUB", commutative: true, typ: "Vec256", resultInArg0: false},
                 {name: "VPMINUBMasked256", argLength: 3, reg: w2kw, asm: "VPMINUB", commutative: true, typ: "Vec256", resultInArg0: false},
+               {name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPERMI2B256", argLength: 3, reg: w31, asm: "VPERMI2B", commutative: false, typ: "Vec256", resultInArg0: true},
+               {name: "VPERMI2BMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2B", commutative: false, typ: "Vec256", resultInArg0: true},
+               {name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPMADDUBSW256", argLength: 2, reg: v21, asm: "VPMADDUBSW", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPMADDUBSWMasked256", argLength: 3, reg: w2kw, asm: "VPMADDUBSW", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPAVGB512", argLength: 2, reg: w21, asm: "VPAVGB", commutative: true, typ: "Vec512", resultInArg0: false},
@@ -723,6 +783,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPMAXUBMasked512", argLength: 3, reg: w2kw, asm: "VPMAXUB", commutative: true, typ: "Vec512", resultInArg0: false},
                 {name: "VPMINUB512", argLength: 2, reg: w21, asm: "VPMINUB", commutative: true, typ: "Vec512", resultInArg0: false},
                 {name: "VPMINUBMasked512", argLength: 3, reg: w2kw, asm: "VPMINUB", commutative: true, typ: "Vec512", resultInArg0: false},
+               {name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPERMI2B512", argLength: 3, reg: w31, asm: "VPERMI2B", commutative: false, typ: "Vec512", resultInArg0: true},
+               {name: "VPERMI2BMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2B", commutative: false, typ: "Vec512", resultInArg0: true},
+               {name: "VPERMBMasked512", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VPMADDUBSW512", argLength: 2, reg: w21, asm: "VPMADDUBSW", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VPMADDUBSWMasked512", argLength: 3, reg: w2kw, asm: "VPMADDUBSW", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VRNDSCALEPS512", argLength: 1, reg: w11, asm: "VRNDSCALEPS", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go

index a1dfc1e7da7a6bd5c760dfb447cb02fa69c9e441..dd27d0cc9411ddbc0ffda8ee1f0ad8b1e0ceb0f1 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -889,6 +889,14 @@ func simdGenericOps() []opData {
                 {name: "OrUint16x16", argLength: 2, commutative: true},
                 {name: "PairwiseAddUint16x16", argLength: 2, commutative: false},
                 {name: "PairwiseSubUint16x16", argLength: 2, commutative: false},
+               {name: "PermuteInt16x16", argLength: 2, commutative: false},
+               {name: "PermuteUint16x16", argLength: 2, commutative: false},
+               {name: "Permute2Uint16x16", argLength: 3, commutative: false},
+               {name: "Permute2Int16x16", argLength: 3, commutative: false},
+               {name: "Permute2MaskedUint16x16", argLength: 4, commutative: false},
+               {name: "Permute2MaskedInt16x16", argLength: 4, commutative: false},
+               {name: "PermuteMaskedUint16x16", argLength: 3, commutative: false},
+               {name: "PermuteMaskedInt16x16", argLength: 3, commutative: false},
                 {name: "PopCountUint16x16", argLength: 1, commutative: false},
                 {name: "PopCountMaskedUint16x16", argLength: 2, commutative: false},
                 {name: "SaturatedAddUint16x16", argLength: 2, commutative: true},
@@ -932,6 +940,14 @@ func simdGenericOps() []opData {
                 {name: "MulHighMaskedUint16x32", argLength: 3, commutative: true},
                 {name: "NotEqualUint16x32", argLength: 2, commutative: true},
                 {name: "NotEqualMaskedUint16x32", argLength: 3, commutative: true},
+               {name: "PermuteUint16x32", argLength: 2, commutative: false},
+               {name: "PermuteInt16x32", argLength: 2, commutative: false},
+               {name: "Permute2Int16x32", argLength: 3, commutative: false},
+               {name: "Permute2Uint16x32", argLength: 3, commutative: false},
+               {name: "Permute2MaskedUint16x32", argLength: 4, commutative: false},
+               {name: "Permute2MaskedInt16x32", argLength: 4, commutative: false},
+               {name: "PermuteMaskedUint16x32", argLength: 3, commutative: false},
+               {name: "PermuteMaskedInt16x32", argLength: 3, commutative: false},
                 {name: "PopCountUint16x32", argLength: 1, commutative: false},
                 {name: "PopCountMaskedUint16x32", argLength: 2, commutative: false},
                 {name: "SaturatedAddUint16x32", argLength: 2, commutative: true},
@@ -979,6 +995,14 @@ func simdGenericOps() []opData {
                 {name: "OrUint16x8", argLength: 2, commutative: true},
                 {name: "PairwiseAddUint16x8", argLength: 2, commutative: false},
                 {name: "PairwiseSubUint16x8", argLength: 2, commutative: false},
+               {name: "PermuteUint16x8", argLength: 2, commutative: false},
+               {name: "PermuteInt16x8", argLength: 2, commutative: false},
+               {name: "Permute2Int16x8", argLength: 3, commutative: false},
+               {name: "Permute2Uint16x8", argLength: 3, commutative: false},
+               {name: "Permute2MaskedUint16x8", argLength: 4, commutative: false},
+               {name: "Permute2MaskedInt16x8", argLength: 4, commutative: false},
+               {name: "PermuteMaskedInt16x8", argLength: 3, commutative: false},
+               {name: "PermuteMaskedUint16x8", argLength: 3, commutative: false},
                 {name: "PopCountUint16x8", argLength: 1, commutative: false},
                 {name: "PopCountMaskedUint16x8", argLength: 2, commutative: false},
                 {name: "SaturatedAddUint16x8", argLength: 2, commutative: true},
@@ -1024,6 +1048,18 @@ func simdGenericOps() []opData {
                 {name: "NotEqualMaskedUint32x16", argLength: 3, commutative: true},
                 {name: "OrUint32x16", argLength: 2, commutative: true},
                 {name: "OrMaskedUint32x16", argLength: 3, commutative: true},
+               {name: "PermuteInt32x16", argLength: 2, commutative: false},
+               {name: "PermuteUint32x16", argLength: 2, commutative: false},
+               {name: "PermuteFloat32x16", argLength: 2, commutative: false},
+               {name: "Permute2Int32x16", argLength: 3, commutative: false},
+               {name: "Permute2Uint32x16", argLength: 3, commutative: false},
+               {name: "Permute2Float32x16", argLength: 3, commutative: false},
+               {name: "Permute2MaskedUint32x16", argLength: 4, commutative: false},
+               {name: "Permute2MaskedInt32x16", argLength: 4, commutative: false},
+               {name: "Permute2MaskedFloat32x16", argLength: 4, commutative: false},
+               {name: "PermuteMaskedUint32x16", argLength: 3, commutative: false},
+               {name: "PermuteMaskedInt32x16", argLength: 3, commutative: false},
+               {name: "PermuteMaskedFloat32x16", argLength: 3, commutative: false},
                 {name: "PopCountUint32x16", argLength: 1, commutative: false},
                 {name: "PopCountMaskedUint32x16", argLength: 2, commutative: false},
                 {name: "RotateLeftUint32x16", argLength: 2, commutative: false},
@@ -1077,6 +1113,12 @@ func simdGenericOps() []opData {
                 {name: "OrMaskedUint32x4", argLength: 3, commutative: true},
                 {name: "PairwiseAddUint32x4", argLength: 2, commutative: false},
                 {name: "PairwiseSubUint32x4", argLength: 2, commutative: false},
+               {name: "Permute2Uint32x4", argLength: 3, commutative: false},
+               {name: "Permute2Float32x4", argLength: 3, commutative: false},
+               {name: "Permute2Int32x4", argLength: 3, commutative: false},
+               {name: "Permute2MaskedUint32x4", argLength: 4, commutative: false},
+               {name: "Permute2MaskedInt32x4", argLength: 4, commutative: false},
+               {name: "Permute2MaskedFloat32x4", argLength: 4, commutative: false},
                 {name: "PopCountUint32x4", argLength: 1, commutative: false},
                 {name: "PopCountMaskedUint32x4", argLength: 2, commutative: false},
                 {name: "RotateLeftUint32x4", argLength: 2, commutative: false},
@@ -1130,6 +1172,18 @@ func simdGenericOps() []opData {
                 {name: "OrMaskedUint32x8", argLength: 3, commutative: true},
                 {name: "PairwiseAddUint32x8", argLength: 2, commutative: false},
                 {name: "PairwiseSubUint32x8", argLength: 2, commutative: false},
+               {name: "PermuteInt32x8", argLength: 2, commutative: false},
+               {name: "PermuteFloat32x8", argLength: 2, commutative: false},
+               {name: "PermuteUint32x8", argLength: 2, commutative: false},
+               {name: "Permute2Uint32x8", argLength: 3, commutative: false},
+               {name: "Permute2Float32x8", argLength: 3, commutative: false},
+               {name: "Permute2Int32x8", argLength: 3, commutative: false},
+               {name: "Permute2MaskedFloat32x8", argLength: 4, commutative: false},
+               {name: "Permute2MaskedUint32x8", argLength: 4, commutative: false},
+               {name: "Permute2MaskedInt32x8", argLength: 4, commutative: false},
+               {name: "PermuteMaskedInt32x8", argLength: 3, commutative: false},
+               {name: "PermuteMaskedFloat32x8", argLength: 3, commutative: false},
+               {name: "PermuteMaskedUint32x8", argLength: 3, commutative: false},
                 {name: "PopCountUint32x8", argLength: 1, commutative: false},
                 {name: "PopCountMaskedUint32x8", argLength: 2, commutative: false},
                 {name: "RotateLeftUint32x8", argLength: 2, commutative: false},
@@ -1182,6 +1236,12 @@ func simdGenericOps() []opData {
                 {name: "NotEqualMaskedUint64x2", argLength: 3, commutative: true},
                 {name: "OrUint64x2", argLength: 2, commutative: true},
                 {name: "OrMaskedUint64x2", argLength: 3, commutative: true},
+               {name: "Permute2Uint64x2", argLength: 3, commutative: false},
+               {name: "Permute2Int64x2", argLength: 3, commutative: false},
+               {name: "Permute2Float64x2", argLength: 3, commutative: false},
+               {name: "Permute2MaskedUint64x2", argLength: 4, commutative: false},
+               {name: "Permute2MaskedInt64x2", argLength: 4, commutative: false},
+               {name: "Permute2MaskedFloat64x2", argLength: 4, commutative: false},
                 {name: "PopCountUint64x2", argLength: 1, commutative: false},
                 {name: "PopCountMaskedUint64x2", argLength: 2, commutative: false},
                 {name: "RotateLeftUint64x2", argLength: 2, commutative: false},
@@ -1230,6 +1290,18 @@ func simdGenericOps() []opData {
                 {name: "NotEqualMaskedUint64x4", argLength: 3, commutative: true},
                 {name: "OrUint64x4", argLength: 2, commutative: true},
                 {name: "OrMaskedUint64x4", argLength: 3, commutative: true},
+               {name: "PermuteUint64x4", argLength: 2, commutative: false},
+               {name: "PermuteInt64x4", argLength: 2, commutative: false},
+               {name: "PermuteFloat64x4", argLength: 2, commutative: false},
+               {name: "Permute2Uint64x4", argLength: 3, commutative: false},
+               {name: "Permute2Int64x4", argLength: 3, commutative: false},
+               {name: "Permute2Float64x4", argLength: 3, commutative: false},
+               {name: "Permute2MaskedInt64x4", argLength: 4, commutative: false},
+               {name: "Permute2MaskedUint64x4", argLength: 4, commutative: false},
+               {name: "Permute2MaskedFloat64x4", argLength: 4, commutative: false},
+               {name: "PermuteMaskedFloat64x4", argLength: 3, commutative: false},
+               {name: "PermuteMaskedInt64x4", argLength: 3, commutative: false},
+               {name: "PermuteMaskedUint64x4", argLength: 3, commutative: false},
                 {name: "PopCountUint64x4", argLength: 1, commutative: false},
                 {name: "PopCountMaskedUint64x4", argLength: 2, commutative: false},
                 {name: "RotateLeftUint64x4", argLength: 2, commutative: false},
@@ -1278,6 +1350,18 @@ func simdGenericOps() []opData {
                 {name: "NotEqualMaskedUint64x8", argLength: 3, commutative: true},
                 {name: "OrUint64x8", argLength: 2, commutative: true},
                 {name: "OrMaskedUint64x8", argLength: 3, commutative: true},
+               {name: "PermuteUint64x8", argLength: 2, commutative: false},
+               {name: "PermuteInt64x8", argLength: 2, commutative: false},
+               {name: "PermuteFloat64x8", argLength: 2, commutative: false},
+               {name: "Permute2Int64x8", argLength: 3, commutative: false},
+               {name: "Permute2Uint64x8", argLength: 3, commutative: false},
+               {name: "Permute2Float64x8", argLength: 3, commutative: false},
+               {name: "Permute2MaskedUint64x8", argLength: 4, commutative: false},
+               {name: "Permute2MaskedInt64x8", argLength: 4, commutative: false},
+               {name: "Permute2MaskedFloat64x8", argLength: 4, commutative: false},
+               {name: "PermuteMaskedFloat64x8", argLength: 3, commutative: false},
+               {name: "PermuteMaskedInt64x8", argLength: 3, commutative: false},
+               {name: "PermuteMaskedUint64x8", argLength: 3, commutative: false},
                 {name: "PopCountUint64x8", argLength: 1, commutative: false},
                 {name: "PopCountMaskedUint64x8", argLength: 2, commutative: false},
                 {name: "RotateLeftUint64x8", argLength: 2, commutative: false},
@@ -1325,6 +1409,14 @@ func simdGenericOps() []opData {
                 {name: "NotEqualUint8x16", argLength: 2, commutative: true},
                 {name: "NotEqualMaskedUint8x16", argLength: 3, commutative: true},
                 {name: "OrUint8x16", argLength: 2, commutative: true},
+               {name: "PermuteUint8x16", argLength: 2, commutative: false},
+               {name: "PermuteInt8x16", argLength: 2, commutative: false},
+               {name: "Permute2Uint8x16", argLength: 3, commutative: false},
+               {name: "Permute2Int8x16", argLength: 3, commutative: false},
+               {name: "Permute2MaskedInt8x16", argLength: 4, commutative: false},
+               {name: "Permute2MaskedUint8x16", argLength: 4, commutative: false},
+               {name: "PermuteMaskedInt8x16", argLength: 3, commutative: false},
+               {name: "PermuteMaskedUint8x16", argLength: 3, commutative: false},
                 {name: "PopCountUint8x16", argLength: 1, commutative: false},
                 {name: "PopCountMaskedUint8x16", argLength: 2, commutative: false},
                 {name: "SaturatedAddUint8x16", argLength: 2, commutative: true},
@@ -1361,6 +1453,14 @@ func simdGenericOps() []opData {
                 {name: "NotEqualUint8x32", argLength: 2, commutative: true},
                 {name: "NotEqualMaskedUint8x32", argLength: 3, commutative: true},
                 {name: "OrUint8x32", argLength: 2, commutative: true},
+               {name: "PermuteUint8x32", argLength: 2, commutative: false},
+               {name: "PermuteInt8x32", argLength: 2, commutative: false},
+               {name: "Permute2Int8x32", argLength: 3, commutative: false},
+               {name: "Permute2Uint8x32", argLength: 3, commutative: false},
+               {name: "Permute2MaskedUint8x32", argLength: 4, commutative: false},
+               {name: "Permute2MaskedInt8x32", argLength: 4, commutative: false},
+               {name: "PermuteMaskedUint8x32", argLength: 3, commutative: false},
+               {name: "PermuteMaskedInt8x32", argLength: 3, commutative: false},
                 {name: "PopCountUint8x32", argLength: 1, commutative: false},
                 {name: "PopCountMaskedUint8x32", argLength: 2, commutative: false},
                 {name: "SaturatedAddUint8x32", argLength: 2, commutative: true},
@@ -1394,6 +1494,14 @@ func simdGenericOps() []opData {
                 {name: "MinMaskedUint8x64", argLength: 3, commutative: true},
                 {name: "NotEqualUint8x64", argLength: 2, commutative: true},
                 {name: "NotEqualMaskedUint8x64", argLength: 3, commutative: true},
+               {name: "PermuteUint8x64", argLength: 2, commutative: false},
+               {name: "PermuteInt8x64", argLength: 2, commutative: false},
+               {name: "Permute2Int8x64", argLength: 3, commutative: false},
+               {name: "Permute2Uint8x64", argLength: 3, commutative: false},
+               {name: "Permute2MaskedUint8x64", argLength: 4, commutative: false},
+               {name: "Permute2MaskedInt8x64", argLength: 4, commutative: false},
+               {name: "PermuteMaskedInt8x64", argLength: 3, commutative: false},
+               {name: "PermuteMaskedUint8x64", argLength: 3, commutative: false},
                 {name: "PopCountUint8x64", argLength: 1, commutative: false},
                 {name: "PopCountMaskedUint8x64", argLength: 2, commutative: false},
                 {name: "SaturatedAddUint8x64", argLength: 2, commutative: true},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go

index ba28c58b7edf50dc69a356e51fd2a991c92bc46c..60a12e21fb198e4d12dcd0f6d8a82e1054dea8ba 100644 (file)
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -1808,6 +1808,10 @@ const (
         OpAMD64VPMINUWMasked256
         OpAMD64VPMULHUW256
         OpAMD64VPMULHUWMasked256
+       OpAMD64VPERMW256
+       OpAMD64VPERMI2W256
+       OpAMD64VPERMI2WMasked256
+       OpAMD64VPERMWMasked256
         OpAMD64VPSRLW256
         OpAMD64VPSRLWMasked256
         OpAMD64VPSRLVW256
@@ -1820,6 +1824,10 @@ const (
         OpAMD64VPMINUWMasked512
         OpAMD64VPMULHUW512
         OpAMD64VPMULHUWMasked512
+       OpAMD64VPERMW512
+       OpAMD64VPERMI2W512
+       OpAMD64VPERMI2WMasked512
+       OpAMD64VPERMWMasked512
         OpAMD64VPSRLW512
         OpAMD64VPSRLWMasked512
         OpAMD64VPSRLVW512
@@ -1832,6 +1840,10 @@ const (
         OpAMD64VPMINUWMasked128
         OpAMD64VPMULHUW128
         OpAMD64VPMULHUWMasked128
+       OpAMD64VPERMW128
+       OpAMD64VPERMI2W128
+       OpAMD64VPERMI2WMasked128
+       OpAMD64VPERMWMasked128
         OpAMD64VPSRLW128
         OpAMD64VPSRLWMasked128
         OpAMD64VPSRLVW128
@@ -1840,6 +1852,14 @@ const (
         OpAMD64VPMAXUDMasked512
         OpAMD64VPMINUD512
         OpAMD64VPMINUDMasked512
+       OpAMD64VPERMPS512
+       OpAMD64VPERMD512
+       OpAMD64VPERMI2D512
+       OpAMD64VPERMI2PS512
+       OpAMD64VPERMI2DMasked512
+       OpAMD64VPERMI2PSMasked512
+       OpAMD64VPERMPSMasked512
+       OpAMD64VPERMDMasked512
         OpAMD64VPSRLD512
         OpAMD64VPSRLDMasked512
         OpAMD64VPSRLVD512
@@ -1849,6 +1869,10 @@ const (
         OpAMD64VPMINUD128
         OpAMD64VPMINUDMasked128
         OpAMD64VPMULUDQ128
+       OpAMD64VPERMI2D128
+       OpAMD64VPERMI2PS128
+       OpAMD64VPERMI2PSMasked128
+       OpAMD64VPERMI2DMasked128
         OpAMD64VPSRLD128
         OpAMD64VPSRLDMasked128
         OpAMD64VPSRLVD128
@@ -1858,6 +1882,14 @@ const (
         OpAMD64VPMINUD256
         OpAMD64VPMINUDMasked256
         OpAMD64VPMULUDQ256
+       OpAMD64VPERMD256
+       OpAMD64VPERMPS256
+       OpAMD64VPERMI2D256
+       OpAMD64VPERMI2PS256
+       OpAMD64VPERMI2PSMasked256
+       OpAMD64VPERMI2DMasked256
+       OpAMD64VPERMPSMasked256
+       OpAMD64VPERMDMasked256
         OpAMD64VPSRLD256
         OpAMD64VPSRLDMasked256
         OpAMD64VPSRLVD256
@@ -1867,6 +1899,10 @@ const (
         OpAMD64VPMINUQ128
         OpAMD64VPMINUQMasked128
         OpAMD64VPMULUDQMasked128
+       OpAMD64VPERMI2PD128
+       OpAMD64VPERMI2Q128
+       OpAMD64VPERMI2QMasked128
+       OpAMD64VPERMI2PDMasked128
         OpAMD64VPSRLQ128
         OpAMD64VPSRLQMasked128
         OpAMD64VPSRLVQ128
@@ -1876,6 +1912,14 @@ const (
         OpAMD64VPMINUQ256
         OpAMD64VPMINUQMasked256
         OpAMD64VPMULUDQMasked256
+       OpAMD64VPERMQ256
+       OpAMD64VPERMPD256
+       OpAMD64VPERMI2PD256
+       OpAMD64VPERMI2Q256
+       OpAMD64VPERMI2PDMasked256
+       OpAMD64VPERMI2QMasked256
+       OpAMD64VPERMPDMasked256
+       OpAMD64VPERMQMasked256
         OpAMD64VPSRLQ256
         OpAMD64VPSRLQMasked256
         OpAMD64VPSRLVQ256
@@ -1886,6 +1930,14 @@ const (
         OpAMD64VPMINUQMasked512
         OpAMD64VPMULUDQ512
         OpAMD64VPMULUDQMasked512
+       OpAMD64VPERMPD512
+       OpAMD64VPERMQ512
+       OpAMD64VPERMI2Q512
+       OpAMD64VPERMI2PD512
+       OpAMD64VPERMI2QMasked512
+       OpAMD64VPERMI2PDMasked512
+       OpAMD64VPERMPDMasked512
+       OpAMD64VPERMQMasked512
         OpAMD64VPSRLQ512
         OpAMD64VPSRLQMasked512
         OpAMD64VPSRLVQ512
@@ -1898,6 +1950,10 @@ const (
         OpAMD64VPMAXUBMasked128
         OpAMD64VPMINUB128
         OpAMD64VPMINUBMasked128
+       OpAMD64VPERMB128
+       OpAMD64VPERMI2B128
+       OpAMD64VPERMI2BMasked128
+       OpAMD64VPERMBMasked128
         OpAMD64VPMADDUBSW128
         OpAMD64VPMADDUBSWMasked128
         OpAMD64VPAVGB256
@@ -1908,6 +1964,10 @@ const (
         OpAMD64VPMAXUBMasked256
         OpAMD64VPMINUB256
         OpAMD64VPMINUBMasked256
+       OpAMD64VPERMB256
+       OpAMD64VPERMI2B256
+       OpAMD64VPERMI2BMasked256
+       OpAMD64VPERMBMasked256
         OpAMD64VPMADDUBSW256
         OpAMD64VPMADDUBSWMasked256
         OpAMD64VPAVGB512
@@ -1918,6 +1978,10 @@ const (
         OpAMD64VPMAXUBMasked512
         OpAMD64VPMINUB512
         OpAMD64VPMINUBMasked512
+       OpAMD64VPERMB512
+       OpAMD64VPERMI2B512
+       OpAMD64VPERMI2BMasked512
+       OpAMD64VPERMBMasked512
         OpAMD64VPMADDUBSW512
         OpAMD64VPMADDUBSWMasked512
         OpAMD64VRNDSCALEPS512
@@ -5207,6 +5271,14 @@ const (
         OpOrUint16x16
         OpPairwiseAddUint16x16
         OpPairwiseSubUint16x16
+       OpPermuteInt16x16
+       OpPermuteUint16x16
+       OpPermute2Uint16x16
+       OpPermute2Int16x16
+       OpPermute2MaskedUint16x16
+       OpPermute2MaskedInt16x16
+       OpPermuteMaskedUint16x16
+       OpPermuteMaskedInt16x16
         OpPopCountUint16x16
         OpPopCountMaskedUint16x16
         OpSaturatedAddUint16x16
@@ -5250,6 +5322,14 @@ const (
         OpMulHighMaskedUint16x32
         OpNotEqualUint16x32
         OpNotEqualMaskedUint16x32
+       OpPermuteUint16x32
+       OpPermuteInt16x32
+       OpPermute2Int16x32
+       OpPermute2Uint16x32
+       OpPermute2MaskedUint16x32
+       OpPermute2MaskedInt16x32
+       OpPermuteMaskedUint16x32
+       OpPermuteMaskedInt16x32
         OpPopCountUint16x32
         OpPopCountMaskedUint16x32
         OpSaturatedAddUint16x32
@@ -5297,6 +5377,14 @@ const (
         OpOrUint16x8
         OpPairwiseAddUint16x8
         OpPairwiseSubUint16x8
+       OpPermuteUint16x8
+       OpPermuteInt16x8
+       OpPermute2Int16x8
+       OpPermute2Uint16x8
+       OpPermute2MaskedUint16x8
+       OpPermute2MaskedInt16x8
+       OpPermuteMaskedInt16x8
+       OpPermuteMaskedUint16x8
         OpPopCountUint16x8
         OpPopCountMaskedUint16x8
         OpSaturatedAddUint16x8
@@ -5342,6 +5430,18 @@ const (
         OpNotEqualMaskedUint32x16
         OpOrUint32x16
         OpOrMaskedUint32x16
+       OpPermuteInt32x16
+       OpPermuteUint32x16
+       OpPermuteFloat32x16
+       OpPermute2Int32x16
+       OpPermute2Uint32x16
+       OpPermute2Float32x16
+       OpPermute2MaskedUint32x16
+       OpPermute2MaskedInt32x16
+       OpPermute2MaskedFloat32x16
+       OpPermuteMaskedUint32x16
+       OpPermuteMaskedInt32x16
+       OpPermuteMaskedFloat32x16
         OpPopCountUint32x16
         OpPopCountMaskedUint32x16
         OpRotateLeftUint32x16
@@ -5395,6 +5495,12 @@ const (
         OpOrMaskedUint32x4
         OpPairwiseAddUint32x4
         OpPairwiseSubUint32x4
+       OpPermute2Uint32x4
+       OpPermute2Float32x4
+       OpPermute2Int32x4
+       OpPermute2MaskedUint32x4
+       OpPermute2MaskedInt32x4
+       OpPermute2MaskedFloat32x4
         OpPopCountUint32x4
         OpPopCountMaskedUint32x4
         OpRotateLeftUint32x4
@@ -5448,6 +5554,18 @@ const (
         OpOrMaskedUint32x8
         OpPairwiseAddUint32x8
         OpPairwiseSubUint32x8
+       OpPermuteInt32x8
+       OpPermuteFloat32x8
+       OpPermuteUint32x8
+       OpPermute2Uint32x8
+       OpPermute2Float32x8
+       OpPermute2Int32x8
+       OpPermute2MaskedFloat32x8
+       OpPermute2MaskedUint32x8
+       OpPermute2MaskedInt32x8
+       OpPermuteMaskedInt32x8
+       OpPermuteMaskedFloat32x8
+       OpPermuteMaskedUint32x8
         OpPopCountUint32x8
         OpPopCountMaskedUint32x8
         OpRotateLeftUint32x8
@@ -5500,6 +5618,12 @@ const (
         OpNotEqualMaskedUint64x2
         OpOrUint64x2
         OpOrMaskedUint64x2
+       OpPermute2Uint64x2
+       OpPermute2Int64x2
+       OpPermute2Float64x2
+       OpPermute2MaskedUint64x2
+       OpPermute2MaskedInt64x2
+       OpPermute2MaskedFloat64x2
         OpPopCountUint64x2
         OpPopCountMaskedUint64x2
         OpRotateLeftUint64x2
@@ -5548,6 +5672,18 @@ const (
         OpNotEqualMaskedUint64x4
         OpOrUint64x4
         OpOrMaskedUint64x4
+       OpPermuteUint64x4
+       OpPermuteInt64x4
+       OpPermuteFloat64x4
+       OpPermute2Uint64x4
+       OpPermute2Int64x4
+       OpPermute2Float64x4
+       OpPermute2MaskedInt64x4
+       OpPermute2MaskedUint64x4
+       OpPermute2MaskedFloat64x4
+       OpPermuteMaskedFloat64x4
+       OpPermuteMaskedInt64x4
+       OpPermuteMaskedUint64x4
         OpPopCountUint64x4
         OpPopCountMaskedUint64x4
         OpRotateLeftUint64x4
@@ -5596,6 +5732,18 @@ const (
         OpNotEqualMaskedUint64x8
         OpOrUint64x8
         OpOrMaskedUint64x8
+       OpPermuteUint64x8
+       OpPermuteInt64x8
+       OpPermuteFloat64x8
+       OpPermute2Int64x8
+       OpPermute2Uint64x8
+       OpPermute2Float64x8
+       OpPermute2MaskedUint64x8
+       OpPermute2MaskedInt64x8
+       OpPermute2MaskedFloat64x8
+       OpPermuteMaskedFloat64x8
+       OpPermuteMaskedInt64x8
+       OpPermuteMaskedUint64x8
         OpPopCountUint64x8
         OpPopCountMaskedUint64x8
         OpRotateLeftUint64x8
@@ -5643,6 +5791,14 @@ const (
         OpNotEqualUint8x16
         OpNotEqualMaskedUint8x16
         OpOrUint8x16
+       OpPermuteUint8x16
+       OpPermuteInt8x16
+       OpPermute2Uint8x16
+       OpPermute2Int8x16
+       OpPermute2MaskedInt8x16
+       OpPermute2MaskedUint8x16
+       OpPermuteMaskedInt8x16
+       OpPermuteMaskedUint8x16
         OpPopCountUint8x16
         OpPopCountMaskedUint8x16
         OpSaturatedAddUint8x16
@@ -5679,6 +5835,14 @@ const (
         OpNotEqualUint8x32
         OpNotEqualMaskedUint8x32
         OpOrUint8x32
+       OpPermuteUint8x32
+       OpPermuteInt8x32
+       OpPermute2Int8x32
+       OpPermute2Uint8x32
+       OpPermute2MaskedUint8x32
+       OpPermute2MaskedInt8x32
+       OpPermuteMaskedUint8x32
+       OpPermuteMaskedInt8x32
         OpPopCountUint8x32
         OpPopCountMaskedUint8x32
         OpSaturatedAddUint8x32
@@ -5712,6 +5876,14 @@ const (
         OpMinMaskedUint8x64
         OpNotEqualUint8x64
         OpNotEqualMaskedUint8x64
+       OpPermuteUint8x64
+       OpPermuteInt8x64
+       OpPermute2Int8x64
+       OpPermute2Uint8x64
+       OpPermute2MaskedUint8x64
+       OpPermute2MaskedInt8x64
+       OpPermuteMaskedInt8x64
+       OpPermuteMaskedUint8x64
         OpPopCountUint8x64
         OpPopCountMaskedUint8x64
         OpSaturatedAddUint8x64
@@ -27735,6 +27907,68 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:   "VPERMW256",
+               argLen: 2,
+               asm:    x86.AVPERMW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2W256",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2W,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2WMasked256",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2W,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPERMWMasked256",
+               argLen: 3,
+               asm:    x86.AVPERMW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:   "VPSRLW256",
                 argLen: 2,
@@ -27917,6 +28151,68 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:   "VPERMW512",
+               argLen: 2,
+               asm:    x86.AVPERMW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2W512",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2W,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2WMasked512",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2W,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPERMWMasked512",
+               argLen: 3,
+               asm:    x86.AVPERMW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:   "VPSRLW512",
                 argLen: 2,
@@ -28099,6 +28395,68 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:   "VPERMW128",
+               argLen: 2,
+               asm:    x86.AVPERMW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2W128",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2W,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2WMasked128",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2W,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPERMWMasked128",
+               argLen: 3,
+               asm:    x86.AVPERMW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:   "VPSRLW128",
                 argLen: 2,
@@ -28219,6 +28577,130 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:   "VPERMPS512",
+               argLen: 2,
+               asm:    x86.AVPERMPS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPERMD512",
+               argLen: 2,
+               asm:    x86.AVPERMD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2D512",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2D,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2PS512",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2PS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2DMasked512",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2D,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2PSMasked512",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2PS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPERMPSMasked512",
+               argLen: 3,
+               asm:    x86.AVPERMPS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPERMDMasked512",
+               argLen: 3,
+               asm:    x86.AVPERMD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:   "VPSRLD512",
                 argLen: 2,
@@ -28354,6 +28836,72 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:         "VPERMI2D128",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2D,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2PS128",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2PS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2PSMasked128",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2PS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2DMasked128",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2D,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:   "VPSRLD128",
                 argLen: 2,
@@ -28489,6 +29037,130 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:   "VPERMD256",
+               argLen: 2,
+               asm:    x86.AVPERMD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPERMPS256",
+               argLen: 2,
+               asm:    x86.AVPERMPS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2D256",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2D,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2PS256",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2PS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2PSMasked256",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2PS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2DMasked256",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2D,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPERMPSMasked256",
+               argLen: 3,
+               asm:    x86.AVPERMPS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPERMDMasked256",
+               argLen: 3,
+               asm:    x86.AVPERMD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:   "VPSRLD256",
                 argLen: 2,
@@ -28625,6 +29297,72 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:         "VPERMI2PD128",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2PD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2Q128",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2Q,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2QMasked128",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2Q,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2PDMasked128",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2PD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:   "VPSRLQ128",
                 argLen: 2,
@@ -28761,6 +29499,130 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:   "VPERMQ256",
+               argLen: 2,
+               asm:    x86.AVPERMQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPERMPD256",
+               argLen: 2,
+               asm:    x86.AVPERMPD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2PD256",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2PD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2Q256",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2Q,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2PDMasked256",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2PD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2QMasked256",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2Q,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPERMPDMasked256",
+               argLen: 3,
+               asm:    x86.AVPERMPD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPERMQMasked256",
+               argLen: 3,
+               asm:    x86.AVPERMQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:   "VPSRLQ256",
                 argLen: 2,
@@ -28912,6 +29774,130 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:   "VPERMPD512",
+               argLen: 2,
+               asm:    x86.AVPERMPD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPERMQ512",
+               argLen: 2,
+               asm:    x86.AVPERMQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2Q512",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2Q,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2PD512",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2PD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2QMasked512",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2Q,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2PDMasked512",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2PD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPERMPDMasked512",
+               argLen: 3,
+               asm:    x86.AVPERMPD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPERMQMasked512",
+               argLen: 3,
+               asm:    x86.AVPERMQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:   "VPSRLQ512",
                 argLen: 2,
@@ -29092,6 +30078,68 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:   "VPERMB128",
+               argLen: 2,
+               asm:    x86.AVPERMB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2B128",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2B,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2BMasked128",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2B,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPERMBMasked128",
+               argLen: 3,
+               asm:    x86.AVPERMB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:   "VPMADDUBSW128",
                 argLen: 2,
@@ -29243,6 +30291,68 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:   "VPERMB256",
+               argLen: 2,
+               asm:    x86.AVPERMB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2B256",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2B,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2BMasked256",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2B,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPERMBMasked256",
+               argLen: 3,
+               asm:    x86.AVPERMB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:   "VPMADDUBSW256",
                 argLen: 2,
@@ -29394,6 +30504,68 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:   "VPERMB512",
+               argLen: 2,
+               asm:    x86.AVPERMB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2B512",
+               argLen:       3,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2B,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:         "VPERMI2BMasked512",
+               argLen:       4,
+               resultInArg0: true,
+               asm:          x86.AVPERMI2B,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {2, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPERMBMasked512",
+               argLen: 3,
+               asm:    x86.AVPERMB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:   "VPMADDUBSW512",
                 argLen: 2,
@@ -64012,6 +65184,46 @@ var opcodeTable = [...]opInfo{
                 argLen:  2,
                 generic: true,
         },
+       {
+               name:    "PermuteInt16x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteUint16x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Permute2Uint16x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Int16x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedUint16x16",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedInt16x16",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedUint16x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedInt16x16",
+               argLen:  3,
+               generic: true,
+       },
         {
                 name:    "PopCountUint16x16",
                 argLen:  1,
@@ -64244,6 +65456,46 @@ var opcodeTable = [...]opInfo{
                 commutative: true,
                 generic:     true,
         },
+       {
+               name:    "PermuteUint16x32",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteInt16x32",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Permute2Int16x32",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Uint16x32",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedUint16x32",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedInt16x32",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedUint16x32",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedInt16x32",
+               argLen:  3,
+               generic: true,
+       },
         {
                 name:    "PopCountUint16x32",
                 argLen:  1,
@@ -64497,6 +65749,46 @@ var opcodeTable = [...]opInfo{
                 argLen:  2,
                 generic: true,
         },
+       {
+               name:    "PermuteUint16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteInt16x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Permute2Int16x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Uint16x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedUint16x8",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedInt16x8",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedInt16x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedUint16x8",
+               argLen:  3,
+               generic: true,
+       },
         {
                 name:    "PopCountUint16x8",
                 argLen:  1,
@@ -64739,6 +66031,66 @@ var opcodeTable = [...]opInfo{
                 commutative: true,
                 generic:     true,
         },
+       {
+               name:    "PermuteInt32x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteUint32x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteFloat32x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Permute2Int32x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Uint32x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Float32x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedUint32x16",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedInt32x16",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedFloat32x16",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedUint32x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedInt32x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedFloat32x16",
+               argLen:  3,
+               generic: true,
+       },
         {
                 name:    "PopCountUint32x16",
                 argLen:  1,
@@ -65021,6 +66373,36 @@ var opcodeTable = [...]opInfo{
                 argLen:  2,
                 generic: true,
         },
+       {
+               name:    "Permute2Uint32x4",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Float32x4",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Int32x4",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedUint32x4",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedInt32x4",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedFloat32x4",
+               argLen:  4,
+               generic: true,
+       },
         {
                 name:    "PopCountUint32x4",
                 argLen:  1,
@@ -65303,6 +66685,66 @@ var opcodeTable = [...]opInfo{
                 argLen:  2,
                 generic: true,
         },
+       {
+               name:    "PermuteInt32x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteFloat32x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteUint32x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Permute2Uint32x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Float32x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Int32x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedFloat32x8",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedUint32x8",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedInt32x8",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedInt32x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedFloat32x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedUint32x8",
+               argLen:  3,
+               generic: true,
+       },
         {
                 name:    "PopCountUint32x8",
                 argLen:  1,
@@ -65581,6 +67023,36 @@ var opcodeTable = [...]opInfo{
                 commutative: true,
                 generic:     true,
         },
+       {
+               name:    "Permute2Uint64x2",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Int64x2",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Float64x2",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedUint64x2",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedInt64x2",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedFloat64x2",
+               argLen:  4,
+               generic: true,
+       },
         {
                 name:    "PopCountUint64x2",
                 argLen:  1,
@@ -65839,6 +67311,66 @@ var opcodeTable = [...]opInfo{
                 commutative: true,
                 generic:     true,
         },
+       {
+               name:    "PermuteUint64x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteInt64x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteFloat64x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Permute2Uint64x4",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Int64x4",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Float64x4",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedInt64x4",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedUint64x4",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedFloat64x4",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedFloat64x4",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedInt64x4",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedUint64x4",
+               argLen:  3,
+               generic: true,
+       },
         {
                 name:    "PopCountUint64x4",
                 argLen:  1,
@@ -66097,6 +67629,66 @@ var opcodeTable = [...]opInfo{
                 commutative: true,
                 generic:     true,
         },
+       {
+               name:    "PermuteUint64x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteInt64x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteFloat64x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Permute2Int64x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Uint64x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Float64x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedUint64x8",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedInt64x8",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedFloat64x8",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedFloat64x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedInt64x8",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedUint64x8",
+               argLen:  3,
+               generic: true,
+       },
         {
                 name:    "PopCountUint64x8",
                 argLen:  1,
@@ -66348,6 +67940,46 @@ var opcodeTable = [...]opInfo{
                 commutative: true,
                 generic:     true,
         },
+       {
+               name:    "PermuteUint8x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteInt8x16",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Permute2Uint8x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Int8x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedInt8x16",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedUint8x16",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedInt8x16",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedUint8x16",
+               argLen:  3,
+               generic: true,
+       },
         {
                 name:    "PopCountUint8x16",
                 argLen:  1,
@@ -66545,6 +68177,46 @@ var opcodeTable = [...]opInfo{
                 commutative: true,
                 generic:     true,
         },
+       {
+               name:    "PermuteUint8x32",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteInt8x32",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Permute2Int8x32",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Uint8x32",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedUint8x32",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedInt8x32",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedUint8x32",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedInt8x32",
+               argLen:  3,
+               generic: true,
+       },
         {
                 name:    "PopCountUint8x32",
                 argLen:  1,
@@ -66725,6 +68397,46 @@ var opcodeTable = [...]opInfo{
                 commutative: true,
                 generic:     true,
         },
+       {
+               name:    "PermuteUint8x64",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteInt8x64",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "Permute2Int8x64",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2Uint8x64",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedUint8x64",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "Permute2MaskedInt8x64",
+               argLen:  4,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedInt8x64",
+               argLen:  3,
+               generic: true,
+       },
+       {
+               name:    "PermuteMaskedUint8x64",
+               argLen:  3,
+               generic: true,
+       },
         {
                 name:    "PopCountUint8x64",
                 argLen:  1,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go

index 6d10b009bb90cffbbebd4718d2823048b1fbc037..1aa36bee04202a52438c8739407578122d929ea1 100644 (file)
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -3298,6 +3298,276 @@ func rewriteValueAMD64(v *Value) bool {
                 return true
         case OpPanicBounds:
                 return rewriteValueAMD64_OpPanicBounds(v)
+       case OpPermute2Float32x16:
+               v.Op = OpAMD64VPERMI2PS512
+               return true
+       case OpPermute2Float32x4:
+               v.Op = OpAMD64VPERMI2PS128
+               return true
+       case OpPermute2Float32x8:
+               v.Op = OpAMD64VPERMI2PS256
+               return true
+       case OpPermute2Float64x2:
+               v.Op = OpAMD64VPERMI2PD128
+               return true
+       case OpPermute2Float64x4:
+               v.Op = OpAMD64VPERMI2PD256
+               return true
+       case OpPermute2Float64x8:
+               v.Op = OpAMD64VPERMI2PD512
+               return true
+       case OpPermute2Int16x16:
+               v.Op = OpAMD64VPERMI2W256
+               return true
+       case OpPermute2Int16x32:
+               v.Op = OpAMD64VPERMI2W512
+               return true
+       case OpPermute2Int16x8:
+               v.Op = OpAMD64VPERMI2W128
+               return true
+       case OpPermute2Int32x16:
+               v.Op = OpAMD64VPERMI2D512
+               return true
+       case OpPermute2Int32x4:
+               v.Op = OpAMD64VPERMI2D128
+               return true
+       case OpPermute2Int32x8:
+               v.Op = OpAMD64VPERMI2D256
+               return true
+       case OpPermute2Int64x2:
+               v.Op = OpAMD64VPERMI2Q128
+               return true
+       case OpPermute2Int64x4:
+               v.Op = OpAMD64VPERMI2Q256
+               return true
+       case OpPermute2Int64x8:
+               v.Op = OpAMD64VPERMI2Q512
+               return true
+       case OpPermute2Int8x16:
+               v.Op = OpAMD64VPERMI2B128
+               return true
+       case OpPermute2Int8x32:
+               v.Op = OpAMD64VPERMI2B256
+               return true
+       case OpPermute2Int8x64:
+               v.Op = OpAMD64VPERMI2B512
+               return true
+       case OpPermute2MaskedFloat32x16:
+               return rewriteValueAMD64_OpPermute2MaskedFloat32x16(v)
+       case OpPermute2MaskedFloat32x4:
+               return rewriteValueAMD64_OpPermute2MaskedFloat32x4(v)
+       case OpPermute2MaskedFloat32x8:
+               return rewriteValueAMD64_OpPermute2MaskedFloat32x8(v)
+       case OpPermute2MaskedFloat64x2:
+               return rewriteValueAMD64_OpPermute2MaskedFloat64x2(v)
+       case OpPermute2MaskedFloat64x4:
+               return rewriteValueAMD64_OpPermute2MaskedFloat64x4(v)
+       case OpPermute2MaskedFloat64x8:
+               return rewriteValueAMD64_OpPermute2MaskedFloat64x8(v)
+       case OpPermute2MaskedInt16x16:
+               return rewriteValueAMD64_OpPermute2MaskedInt16x16(v)
+       case OpPermute2MaskedInt16x32:
+               return rewriteValueAMD64_OpPermute2MaskedInt16x32(v)
+       case OpPermute2MaskedInt16x8:
+               return rewriteValueAMD64_OpPermute2MaskedInt16x8(v)
+       case OpPermute2MaskedInt32x16:
+               return rewriteValueAMD64_OpPermute2MaskedInt32x16(v)
+       case OpPermute2MaskedInt32x4:
+               return rewriteValueAMD64_OpPermute2MaskedInt32x4(v)
+       case OpPermute2MaskedInt32x8:
+               return rewriteValueAMD64_OpPermute2MaskedInt32x8(v)
+       case OpPermute2MaskedInt64x2:
+               return rewriteValueAMD64_OpPermute2MaskedInt64x2(v)
+       case OpPermute2MaskedInt64x4:
+               return rewriteValueAMD64_OpPermute2MaskedInt64x4(v)
+       case OpPermute2MaskedInt64x8:
+               return rewriteValueAMD64_OpPermute2MaskedInt64x8(v)
+       case OpPermute2MaskedInt8x16:
+               return rewriteValueAMD64_OpPermute2MaskedInt8x16(v)
+       case OpPermute2MaskedInt8x32:
+               return rewriteValueAMD64_OpPermute2MaskedInt8x32(v)
+       case OpPermute2MaskedInt8x64:
+               return rewriteValueAMD64_OpPermute2MaskedInt8x64(v)
+       case OpPermute2MaskedUint16x16:
+               return rewriteValueAMD64_OpPermute2MaskedUint16x16(v)
+       case OpPermute2MaskedUint16x32:
+               return rewriteValueAMD64_OpPermute2MaskedUint16x32(v)
+       case OpPermute2MaskedUint16x8:
+               return rewriteValueAMD64_OpPermute2MaskedUint16x8(v)
+       case OpPermute2MaskedUint32x16:
+               return rewriteValueAMD64_OpPermute2MaskedUint32x16(v)
+       case OpPermute2MaskedUint32x4:
+               return rewriteValueAMD64_OpPermute2MaskedUint32x4(v)
+       case OpPermute2MaskedUint32x8:
+               return rewriteValueAMD64_OpPermute2MaskedUint32x8(v)
+       case OpPermute2MaskedUint64x2:
+               return rewriteValueAMD64_OpPermute2MaskedUint64x2(v)
+       case OpPermute2MaskedUint64x4:
+               return rewriteValueAMD64_OpPermute2MaskedUint64x4(v)
+       case OpPermute2MaskedUint64x8:
+               return rewriteValueAMD64_OpPermute2MaskedUint64x8(v)
+       case OpPermute2MaskedUint8x16:
+               return rewriteValueAMD64_OpPermute2MaskedUint8x16(v)
+       case OpPermute2MaskedUint8x32:
+               return rewriteValueAMD64_OpPermute2MaskedUint8x32(v)
+       case OpPermute2MaskedUint8x64:
+               return rewriteValueAMD64_OpPermute2MaskedUint8x64(v)
+       case OpPermute2Uint16x16:
+               v.Op = OpAMD64VPERMI2W256
+               return true
+       case OpPermute2Uint16x32:
+               v.Op = OpAMD64VPERMI2W512
+               return true
+       case OpPermute2Uint16x8:
+               v.Op = OpAMD64VPERMI2W128
+               return true
+       case OpPermute2Uint32x16:
+               v.Op = OpAMD64VPERMI2D512
+               return true
+       case OpPermute2Uint32x4:
+               v.Op = OpAMD64VPERMI2D128
+               return true
+       case OpPermute2Uint32x8:
+               v.Op = OpAMD64VPERMI2D256
+               return true
+       case OpPermute2Uint64x2:
+               v.Op = OpAMD64VPERMI2Q128
+               return true
+       case OpPermute2Uint64x4:
+               v.Op = OpAMD64VPERMI2Q256
+               return true
+       case OpPermute2Uint64x8:
+               v.Op = OpAMD64VPERMI2Q512
+               return true
+       case OpPermute2Uint8x16:
+               v.Op = OpAMD64VPERMI2B128
+               return true
+       case OpPermute2Uint8x32:
+               v.Op = OpAMD64VPERMI2B256
+               return true
+       case OpPermute2Uint8x64:
+               v.Op = OpAMD64VPERMI2B512
+               return true
+       case OpPermuteFloat32x16:
+               v.Op = OpAMD64VPERMPS512
+               return true
+       case OpPermuteFloat32x8:
+               v.Op = OpAMD64VPERMPS256
+               return true
+       case OpPermuteFloat64x4:
+               v.Op = OpAMD64VPERMPD256
+               return true
+       case OpPermuteFloat64x8:
+               v.Op = OpAMD64VPERMPD512
+               return true
+       case OpPermuteInt16x16:
+               v.Op = OpAMD64VPERMW256
+               return true
+       case OpPermuteInt16x32:
+               v.Op = OpAMD64VPERMW512
+               return true
+       case OpPermuteInt16x8:
+               v.Op = OpAMD64VPERMW128
+               return true
+       case OpPermuteInt32x16:
+               v.Op = OpAMD64VPERMD512
+               return true
+       case OpPermuteInt32x8:
+               v.Op = OpAMD64VPERMD256
+               return true
+       case OpPermuteInt64x4:
+               v.Op = OpAMD64VPERMQ256
+               return true
+       case OpPermuteInt64x8:
+               v.Op = OpAMD64VPERMQ512
+               return true
+       case OpPermuteInt8x16:
+               v.Op = OpAMD64VPERMB128
+               return true
+       case OpPermuteInt8x32:
+               v.Op = OpAMD64VPERMB256
+               return true
+       case OpPermuteInt8x64:
+               v.Op = OpAMD64VPERMB512
+               return true
+       case OpPermuteMaskedFloat32x16:
+               return rewriteValueAMD64_OpPermuteMaskedFloat32x16(v)
+       case OpPermuteMaskedFloat32x8:
+               return rewriteValueAMD64_OpPermuteMaskedFloat32x8(v)
+       case OpPermuteMaskedFloat64x4:
+               return rewriteValueAMD64_OpPermuteMaskedFloat64x4(v)
+       case OpPermuteMaskedFloat64x8:
+               return rewriteValueAMD64_OpPermuteMaskedFloat64x8(v)
+       case OpPermuteMaskedInt16x16:
+               return rewriteValueAMD64_OpPermuteMaskedInt16x16(v)
+       case OpPermuteMaskedInt16x32:
+               return rewriteValueAMD64_OpPermuteMaskedInt16x32(v)
+       case OpPermuteMaskedInt16x8:
+               return rewriteValueAMD64_OpPermuteMaskedInt16x8(v)
+       case OpPermuteMaskedInt32x16:
+               return rewriteValueAMD64_OpPermuteMaskedInt32x16(v)
+       case OpPermuteMaskedInt32x8:
+               return rewriteValueAMD64_OpPermuteMaskedInt32x8(v)
+       case OpPermuteMaskedInt64x4:
+               return rewriteValueAMD64_OpPermuteMaskedInt64x4(v)
+       case OpPermuteMaskedInt64x8:
+               return rewriteValueAMD64_OpPermuteMaskedInt64x8(v)
+       case OpPermuteMaskedInt8x16:
+               return rewriteValueAMD64_OpPermuteMaskedInt8x16(v)
+       case OpPermuteMaskedInt8x32:
+               return rewriteValueAMD64_OpPermuteMaskedInt8x32(v)
+       case OpPermuteMaskedInt8x64:
+               return rewriteValueAMD64_OpPermuteMaskedInt8x64(v)
+       case OpPermuteMaskedUint16x16:
+               return rewriteValueAMD64_OpPermuteMaskedUint16x16(v)
+       case OpPermuteMaskedUint16x32:
+               return rewriteValueAMD64_OpPermuteMaskedUint16x32(v)
+       case OpPermuteMaskedUint16x8:
+               return rewriteValueAMD64_OpPermuteMaskedUint16x8(v)
+       case OpPermuteMaskedUint32x16:
+               return rewriteValueAMD64_OpPermuteMaskedUint32x16(v)
+       case OpPermuteMaskedUint32x8:
+               return rewriteValueAMD64_OpPermuteMaskedUint32x8(v)
+       case OpPermuteMaskedUint64x4:
+               return rewriteValueAMD64_OpPermuteMaskedUint64x4(v)
+       case OpPermuteMaskedUint64x8:
+               return rewriteValueAMD64_OpPermuteMaskedUint64x8(v)
+       case OpPermuteMaskedUint8x16:
+               return rewriteValueAMD64_OpPermuteMaskedUint8x16(v)
+       case OpPermuteMaskedUint8x32:
+               return rewriteValueAMD64_OpPermuteMaskedUint8x32(v)
+       case OpPermuteMaskedUint8x64:
+               return rewriteValueAMD64_OpPermuteMaskedUint8x64(v)
+       case OpPermuteUint16x16:
+               v.Op = OpAMD64VPERMW256
+               return true
+       case OpPermuteUint16x32:
+               v.Op = OpAMD64VPERMW512
+               return true
+       case OpPermuteUint16x8:
+               v.Op = OpAMD64VPERMW128
+               return true
+       case OpPermuteUint32x16:
+               v.Op = OpAMD64VPERMD512
+               return true
+       case OpPermuteUint32x8:
+               v.Op = OpAMD64VPERMD256
+               return true
+       case OpPermuteUint64x4:
+               v.Op = OpAMD64VPERMQ256
+               return true
+       case OpPermuteUint64x8:
+               v.Op = OpAMD64VPERMQ512
+               return true
+       case OpPermuteUint8x16:
+               v.Op = OpAMD64VPERMB128
+               return true
+       case OpPermuteUint8x32:
+               v.Op = OpAMD64VPERMB256
+               return true
+       case OpPermuteUint8x64:
+               v.Op = OpAMD64VPERMB512
+               return true
         case OpPopCount16:
                 return rewriteValueAMD64_OpPopCount16(v)
         case OpPopCount32:
@@ -44315,6 +44585,1038 @@ func rewriteValueAMD64_OpPanicBounds(v *Value) bool {
         }
         return false
  }
+func rewriteValueAMD64_OpPermute2MaskedFloat32x16(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedFloat32x16 x y z mask)
+       // result: (VPERMI2PSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2PSMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedFloat32x4(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedFloat32x4 x y z mask)
+       // result: (VPERMI2PSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2PSMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedFloat32x8(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedFloat32x8 x y z mask)
+       // result: (VPERMI2PSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2PSMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedFloat64x2(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedFloat64x2 x y z mask)
+       // result: (VPERMI2PDMasked128 x y z (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2PDMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedFloat64x4(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedFloat64x4 x y z mask)
+       // result: (VPERMI2PDMasked256 x y z (VPMOVVec64x4ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2PDMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedFloat64x8(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedFloat64x8 x y z mask)
+       // result: (VPERMI2PDMasked512 x y z (VPMOVVec64x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2PDMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedInt16x16(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedInt16x16 x y z mask)
+       // result: (VPERMI2WMasked256 x y z (VPMOVVec16x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2WMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedInt16x32(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedInt16x32 x y z mask)
+       // result: (VPERMI2WMasked512 x y z (VPMOVVec16x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2WMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedInt16x8(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedInt16x8 x y z mask)
+       // result: (VPERMI2WMasked128 x y z (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2WMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedInt32x16(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedInt32x16 x y z mask)
+       // result: (VPERMI2DMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2DMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedInt32x4(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedInt32x4 x y z mask)
+       // result: (VPERMI2DMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2DMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedInt32x8(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedInt32x8 x y z mask)
+       // result: (VPERMI2DMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2DMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedInt64x2(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedInt64x2 x y z mask)
+       // result: (VPERMI2QMasked128 x y z (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2QMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedInt64x4(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedInt64x4 x y z mask)
+       // result: (VPERMI2QMasked256 x y z (VPMOVVec64x4ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2QMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedInt64x8(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedInt64x8 x y z mask)
+       // result: (VPERMI2QMasked512 x y z (VPMOVVec64x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2QMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedInt8x16(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedInt8x16 x y z mask)
+       // result: (VPERMI2BMasked128 x y z (VPMOVVec8x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2BMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedInt8x32(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedInt8x32 x y z mask)
+       // result: (VPERMI2BMasked256 x y z (VPMOVVec8x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2BMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedInt8x64(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedInt8x64 x y z mask)
+       // result: (VPERMI2BMasked512 x y z (VPMOVVec8x64ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2BMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedUint16x16(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedUint16x16 x y z mask)
+       // result: (VPERMI2WMasked256 x y z (VPMOVVec16x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2WMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedUint16x32(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedUint16x32 x y z mask)
+       // result: (VPERMI2WMasked512 x y z (VPMOVVec16x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2WMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedUint16x8(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedUint16x8 x y z mask)
+       // result: (VPERMI2WMasked128 x y z (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2WMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedUint32x16(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedUint32x16 x y z mask)
+       // result: (VPERMI2DMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2DMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedUint32x4(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedUint32x4 x y z mask)
+       // result: (VPERMI2DMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2DMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedUint32x8(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedUint32x8 x y z mask)
+       // result: (VPERMI2DMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2DMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedUint64x2(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedUint64x2 x y z mask)
+       // result: (VPERMI2QMasked128 x y z (VPMOVVec64x2ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2QMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedUint64x4(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedUint64x4 x y z mask)
+       // result: (VPERMI2QMasked256 x y z (VPMOVVec64x4ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2QMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedUint64x8(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedUint64x8 x y z mask)
+       // result: (VPERMI2QMasked512 x y z (VPMOVVec64x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2QMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedUint8x16(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedUint8x16 x y z mask)
+       // result: (VPERMI2BMasked128 x y z (VPMOVVec8x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2BMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedUint8x32(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedUint8x32 x y z mask)
+       // result: (VPERMI2BMasked256 x y z (VPMOVVec8x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2BMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermute2MaskedUint8x64(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (Permute2MaskedUint8x64 x y z mask)
+       // result: (VPERMI2BMasked512 x y z (VPMOVVec8x64ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               z := v_2
+               mask := v_3
+               v.reset(OpAMD64VPERMI2BMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg4(x, y, z, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedFloat32x16(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedFloat32x16 x y mask)
+       // result: (VPERMPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMPSMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedFloat32x8(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedFloat32x8 x y mask)
+       // result: (VPERMPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMPSMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedFloat64x4(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedFloat64x4 x y mask)
+       // result: (VPERMPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMPDMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedFloat64x8(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedFloat64x8 x y mask)
+       // result: (VPERMPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMPDMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedInt16x16(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedInt16x16 x y mask)
+       // result: (VPERMWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMWMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedInt16x32(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedInt16x32 x y mask)
+       // result: (VPERMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMWMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedInt16x8(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedInt16x8 x y mask)
+       // result: (VPERMWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMWMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedInt32x16(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedInt32x16 x y mask)
+       // result: (VPERMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMDMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedInt32x8(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedInt32x8 x y mask)
+       // result: (VPERMDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMDMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedInt64x4(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedInt64x4 x y mask)
+       // result: (VPERMQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMQMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedInt64x8(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedInt64x8 x y mask)
+       // result: (VPERMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMQMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedInt8x16(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedInt8x16 x y mask)
+       // result: (VPERMBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMBMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedInt8x32(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedInt8x32 x y mask)
+       // result: (VPERMBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMBMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedInt8x64(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedInt8x64 x y mask)
+       // result: (VPERMBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMBMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedUint16x16(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedUint16x16 x y mask)
+       // result: (VPERMWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMWMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedUint16x32(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedUint16x32 x y mask)
+       // result: (VPERMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMWMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedUint16x8(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedUint16x8 x y mask)
+       // result: (VPERMWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMWMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedUint32x16(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedUint32x16 x y mask)
+       // result: (VPERMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMDMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedUint32x8(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedUint32x8 x y mask)
+       // result: (VPERMDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMDMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedUint64x4(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedUint64x4 x y mask)
+       // result: (VPERMQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMQMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedUint64x8(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedUint64x8 x y mask)
+       // result: (VPERMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMQMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedUint8x16(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedUint8x16 x y mask)
+       // result: (VPERMBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMBMasked128)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedUint8x32(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedUint8x32 x y mask)
+       // result: (VPERMBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMBMasked256)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
+func rewriteValueAMD64_OpPermuteMaskedUint8x64(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       b := v.Block
+       // match: (PermuteMaskedUint8x64 x y mask)
+       // result: (VPERMBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+       for {
+               x := v_0
+               y := v_1
+               mask := v_2
+               v.reset(OpAMD64VPERMBMasked512)
+               v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
+               v0.AddArg(mask)
+               v.AddArg3(x, y, v0)
+               return true
+       }
+}
  func rewriteValueAMD64_OpPopCount16(v *Value) bool {
         v_0 := v.Args[0]
         b := v.Block
diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go

index c47b0898150b97eb4ac1ba9090ab6f5e59af2e4d..fd7ebb20a340154416224b5a98b0457f7743d12d 100644 (file)
--- a/src/cmd/compile/internal/ssagen/intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/intrinsics.go
@@ -1622,18 +1622,42 @@ func opLen2(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa
         }
  }
  
+func opLen2_21(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+       return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+               return s.newValue2(op, t, args[1], args[0])
+       }
+}
+
  func opLen3(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
         return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                 return s.newValue3(op, t, args[0], args[1], args[2])
         }
  }
  
+func opLen3_21(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+       return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+               return s.newValue3(op, t, args[1], args[0], args[2])
+       }
+}
+
+func opLen3_231(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+       return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+               return s.newValue3(op, t, args[2], args[0], args[1])
+       }
+}
+
  func opLen4(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
         return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                 return s.newValue4(op, t, args[0], args[1], args[2], args[3])
         }
  }
  
+func opLen4_231(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+       return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+               return s.newValue4(op, t, args[2], args[0], args[1], args[3])
+       }
+}
+
  func plainPanicSimdImm(s *state) {
         cmp := s.newValue0(ssa.OpConstBool, types.Types[types.TBOOL])
         cmp.AuxInt = 0
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go

index 58bc420fc4e3046b30157503efb2faade872767c..3805ca35a872c909fcbacf272757fe8e0bdbdc1d 100644 (file)
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -996,6 +996,114 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
         addF(simdPackage, "Uint16x16.PairwiseSub", opLen2(ssa.OpPairwiseSubUint16x16, types.TypeVec256), sys.AMD64)
         addF(simdPackage, "Uint32x4.PairwiseSub", opLen2(ssa.OpPairwiseSubUint32x4, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Uint32x8.PairwiseSub", opLen2(ssa.OpPairwiseSubUint32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint8x32.Permute", opLen2_21(ssa.OpPermuteUint8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x64.Permute", opLen2_21(ssa.OpPermuteInt8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint8x64.Permute", opLen2_21(ssa.OpPermuteUint8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int16x8.Permute", opLen2_21(ssa.OpPermuteInt16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint16x8.Permute", opLen2_21(ssa.OpPermuteUint16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x16.Permute", opLen2_21(ssa.OpPermuteInt16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint16x16.Permute", opLen2_21(ssa.OpPermuteUint16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x32.Permute", opLen2_21(ssa.OpPermuteInt16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint16x32.Permute", opLen2_21(ssa.OpPermuteUint16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x8.Permute", opLen2_21(ssa.OpPermuteFloat32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x8.Permute", opLen2_21(ssa.OpPermuteInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint32x8.Permute", opLen2_21(ssa.OpPermuteUint32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float32x16.Permute", opLen2_21(ssa.OpPermuteFloat32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x16.Permute", opLen2_21(ssa.OpPermuteInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint32x16.Permute", opLen2_21(ssa.OpPermuteUint32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float64x4.Permute", opLen2_21(ssa.OpPermuteFloat64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int64x4.Permute", opLen2_21(ssa.OpPermuteInt64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint64x4.Permute", opLen2_21(ssa.OpPermuteUint64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float64x8.Permute", opLen2_21(ssa.OpPermuteFloat64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int64x8.Permute", opLen2_21(ssa.OpPermuteInt64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint64x8.Permute", opLen2_21(ssa.OpPermuteUint64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int8x16.Permute2", opLen3_231(ssa.OpPermute2Int8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint8x16.Permute2", opLen3_231(ssa.OpPermute2Uint8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int8x32.Permute2", opLen3_231(ssa.OpPermute2Int8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint8x32.Permute2", opLen3_231(ssa.OpPermute2Uint8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x64.Permute2", opLen3_231(ssa.OpPermute2Int8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint8x64.Permute2", opLen3_231(ssa.OpPermute2Uint8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int16x8.Permute2", opLen3_231(ssa.OpPermute2Int16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint16x8.Permute2", opLen3_231(ssa.OpPermute2Uint16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x16.Permute2", opLen3_231(ssa.OpPermute2Int16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint16x16.Permute2", opLen3_231(ssa.OpPermute2Uint16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x32.Permute2", opLen3_231(ssa.OpPermute2Int16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint16x32.Permute2", opLen3_231(ssa.OpPermute2Uint16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x4.Permute2", opLen3_231(ssa.OpPermute2Float32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x4.Permute2", opLen3_231(ssa.OpPermute2Int32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint32x4.Permute2", opLen3_231(ssa.OpPermute2Uint32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float32x8.Permute2", opLen3_231(ssa.OpPermute2Float32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x8.Permute2", opLen3_231(ssa.OpPermute2Int32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint32x8.Permute2", opLen3_231(ssa.OpPermute2Uint32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float32x16.Permute2", opLen3_231(ssa.OpPermute2Float32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x16.Permute2", opLen3_231(ssa.OpPermute2Int32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint32x16.Permute2", opLen3_231(ssa.OpPermute2Uint32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float64x2.Permute2", opLen3_231(ssa.OpPermute2Float64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int64x2.Permute2", opLen3_231(ssa.OpPermute2Int64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint64x2.Permute2", opLen3_231(ssa.OpPermute2Uint64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float64x4.Permute2", opLen3_231(ssa.OpPermute2Float64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int64x4.Permute2", opLen3_231(ssa.OpPermute2Int64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint64x4.Permute2", opLen3_231(ssa.OpPermute2Uint64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float64x8.Permute2", opLen3_231(ssa.OpPermute2Float64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int64x8.Permute2", opLen3_231(ssa.OpPermute2Int64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint64x8.Permute2", opLen3_231(ssa.OpPermute2Uint64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int8x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint8x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int8x32.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint8x32.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x64.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint8x64.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int16x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint16x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint16x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x32.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint16x32.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint32x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float32x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint32x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float32x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint32x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float64x2.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int64x2.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint64x2.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Float64x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int64x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint64x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float64x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int64x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint64x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int8x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint8x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint8x16, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int8x32.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint8x32.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x64.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint8x64.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int16x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint16x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint16x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int16x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint16x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint16x16, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int16x32.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint16x32.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint16x32, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float32x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedFloat32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint32x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float32x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedFloat32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint32x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Float64x4.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedFloat64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int64x4.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint64x4.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Float64x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedFloat64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int64x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint64x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint64x8, types.TypeVec512), sys.AMD64)
         addF(simdPackage, "Int8x16.PopCount", opLen1(ssa.OpPopCountInt8x16, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Int8x32.PopCount", opLen1(ssa.OpPopCountInt8x32, types.TypeVec256), sys.AMD64)
         addF(simdPackage, "Int8x64.PopCount", opLen1(ssa.OpPopCountInt8x64, types.TypeVec512), sys.AMD64)
diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go

index 7a8780e5cba79026ff651f19a8d1d49f1625209d..29899f8cb13a9c0e2f473523eda134641e2f80f9 100644 (file)
--- a/src/simd/ops_amd64.go
+++ b/src/simd/ops_amd64.go
@@ -5391,6 +5391,830 @@ func (x Uint32x4) PairwiseSub(y Uint32x4) Uint32x4
  // Asm: VPHSUBD, CPU Feature: AVX2
  func (x Uint32x8) PairwiseSub(y Uint32x8) Uint32x8
  
+/* Permute */
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x16) Permute(indices Uint8x16) Int8x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Uint8x16) Permute(indices Uint8x16) Uint8x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x32) Permute(indices Uint8x32) Int8x32
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Uint8x32) Permute(indices Uint8x32) Uint8x32
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x64) Permute(indices Uint8x64) Int8x64
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Uint8x64) Permute(indices Uint8x64) Uint8x64
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Int16x8) Permute(indices Uint16x8) Int16x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Uint16x8) Permute(indices Uint16x8) Uint16x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Int16x16) Permute(indices Uint16x16) Int16x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Uint16x16) Permute(indices Uint16x16) Uint16x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Int16x32) Permute(indices Uint16x32) Int16x32
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Uint16x32) Permute(indices Uint16x32) Uint16x32
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMPS, CPU Feature: AVX2
+func (x Float32x8) Permute(indices Uint32x8) Float32x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMD, CPU Feature: AVX2
+func (x Int32x8) Permute(indices Uint32x8) Int32x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMD, CPU Feature: AVX2
+func (x Uint32x8) Permute(indices Uint32x8) Uint32x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMPS, CPU Feature: AVX512F
+func (x Float32x16) Permute(indices Uint32x16) Float32x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMD, CPU Feature: AVX512F
+func (x Int32x16) Permute(indices Uint32x16) Int32x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMD, CPU Feature: AVX512F
+func (x Uint32x16) Permute(indices Uint32x16) Uint32x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMPD, CPU Feature: AVX512F
+func (x Float64x4) Permute(indices Uint64x4) Float64x4
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMQ, CPU Feature: AVX512F
+func (x Int64x4) Permute(indices Uint64x4) Int64x4
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMQ, CPU Feature: AVX512F
+func (x Uint64x4) Permute(indices Uint64x4) Uint64x4
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMPD, CPU Feature: AVX512F
+func (x Float64x8) Permute(indices Uint64x8) Float64x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMQ, CPU Feature: AVX512F
+func (x Int64x8) Permute(indices Uint64x8) Int64x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMQ, CPU Feature: AVX512F
+func (x Uint64x8) Permute(indices Uint64x8) Uint64x8
+
+/* Permute2 */
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x16) Permute2(y Int8x16, indices Uint8x16) Int8x16
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x16) Permute2(y Uint8x16, indices Uint8x16) Uint8x16
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x32) Permute2(y Int8x32, indices Uint8x32) Int8x32
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x32) Permute2(y Uint8x32, indices Uint8x32) Uint8x32
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x64) Permute2(y Int8x64, indices Uint8x64) Int8x64
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x64) Permute2(y Uint8x64, indices Uint8x64) Uint8x64
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Int16x8) Permute2(y Int16x8, indices Uint16x8) Int16x8
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Uint16x8) Permute2(y Uint16x8, indices Uint16x8) Uint16x8
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Int16x16) Permute2(y Int16x16, indices Uint16x16) Int16x16
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Uint16x16) Permute2(y Uint16x16, indices Uint16x16) Uint16x16
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Int16x32) Permute2(y Int16x32, indices Uint16x32) Int16x32
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Uint16x32) Permute2(y Uint16x32, indices Uint16x32) Uint16x32
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512F
+func (x Float32x4) Permute2(y Float32x4, indices Uint32x4) Float32x4
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Int32x4) Permute2(y Int32x4, indices Uint32x4) Int32x4
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Uint32x4) Permute2(y Uint32x4, indices Uint32x4) Uint32x4
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512F
+func (x Float32x8) Permute2(y Float32x8, indices Uint32x8) Float32x8
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Int32x8) Permute2(y Int32x8, indices Uint32x8) Int32x8
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Uint32x8) Permute2(y Uint32x8, indices Uint32x8) Uint32x8
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512F
+func (x Float32x16) Permute2(y Float32x16, indices Uint32x16) Float32x16
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Int32x16) Permute2(y Int32x16, indices Uint32x16) Int32x16
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Uint32x16) Permute2(y Uint32x16, indices Uint32x16) Uint32x16
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512F
+func (x Float64x2) Permute2(y Float64x2, indices Uint64x2) Float64x2
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Int64x2) Permute2(y Int64x2, indices Uint64x2) Int64x2
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Uint64x2) Permute2(y Uint64x2, indices Uint64x2) Uint64x2
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512F
+func (x Float64x4) Permute2(y Float64x4, indices Uint64x4) Float64x4
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Int64x4) Permute2(y Int64x4, indices Uint64x4) Int64x4
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Uint64x4) Permute2(y Uint64x4, indices Uint64x4) Uint64x4
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512F
+func (x Float64x8) Permute2(y Float64x8, indices Uint64x8) Float64x8
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Int64x8) Permute2(y Int64x8, indices Uint64x8) Int64x8
+
+// Permute2 performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Uint64x8) Permute2(y Uint64x8, indices Uint64x8) Uint64x8
+
+/* Permute2Masked */
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x16) Permute2Masked(y Int8x16, indices Uint8x16, u Mask8x16) Int8x16
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x16) Permute2Masked(y Uint8x16, indices Uint8x16, u Mask8x16) Uint8x16
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x32) Permute2Masked(y Int8x32, indices Uint8x32, u Mask8x32) Int8x32
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x32) Permute2Masked(y Uint8x32, indices Uint8x32, u Mask8x32) Uint8x32
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x64) Permute2Masked(y Int8x64, indices Uint8x64, u Mask8x64) Int8x64
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x64) Permute2Masked(y Uint8x64, indices Uint8x64, u Mask8x64) Uint8x64
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Int16x8) Permute2Masked(y Int16x8, indices Uint16x8, u Mask16x8) Int16x8
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Uint16x8) Permute2Masked(y Uint16x8, indices Uint16x8, u Mask16x8) Uint16x8
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Int16x16) Permute2Masked(y Int16x16, indices Uint16x16, u Mask16x16) Int16x16
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Uint16x16) Permute2Masked(y Uint16x16, indices Uint16x16, u Mask16x16) Uint16x16
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Int16x32) Permute2Masked(y Int16x32, indices Uint16x32, u Mask16x32) Int16x32
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512BW
+func (x Uint16x32) Permute2Masked(y Uint16x32, indices Uint16x32, u Mask16x32) Uint16x32
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512F
+func (x Float32x4) Permute2Masked(y Float32x4, indices Uint32x4, u Mask32x4) Float32x4
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Int32x4) Permute2Masked(y Int32x4, indices Uint32x4, u Mask32x4) Int32x4
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Uint32x4) Permute2Masked(y Uint32x4, indices Uint32x4, u Mask32x4) Uint32x4
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512F
+func (x Float32x8) Permute2Masked(y Float32x8, indices Uint32x8, u Mask32x8) Float32x8
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Int32x8) Permute2Masked(y Int32x8, indices Uint32x8, u Mask32x8) Int32x8
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Uint32x8) Permute2Masked(y Uint32x8, indices Uint32x8, u Mask32x8) Uint32x8
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512F
+func (x Float32x16) Permute2Masked(y Float32x16, indices Uint32x16, u Mask32x16) Float32x16
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Int32x16) Permute2Masked(y Int32x16, indices Uint32x16, u Mask32x16) Int32x16
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512F
+func (x Uint32x16) Permute2Masked(y Uint32x16, indices Uint32x16, u Mask32x16) Uint32x16
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512F
+func (x Float64x2) Permute2Masked(y Float64x2, indices Uint64x2, u Mask64x2) Float64x2
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Int64x2) Permute2Masked(y Int64x2, indices Uint64x2, u Mask64x2) Int64x2
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Uint64x2) Permute2Masked(y Uint64x2, indices Uint64x2, u Mask64x2) Uint64x2
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512F
+func (x Float64x4) Permute2Masked(y Float64x4, indices Uint64x4, u Mask64x4) Float64x4
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Int64x4) Permute2Masked(y Int64x4, indices Uint64x4, u Mask64x4) Int64x4
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Uint64x4) Permute2Masked(y Uint64x4, indices Uint64x4, u Mask64x4) Uint64x4
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512F
+func (x Float64x8) Permute2Masked(y Float64x8, indices Uint64x8, u Mask64x8) Float64x8
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Int64x8) Permute2Masked(y Int64x8, indices Uint64x8, u Mask64x8) Int64x8
+
+// Permute2Masked performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is x appending y.
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512F
+func (x Uint64x8) Permute2Masked(y Uint64x8, indices Uint64x8, u Mask64x8) Uint64x8
+
+/* PermuteMasked */
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x16) PermuteMasked(indices Uint8x16, z Mask8x16) Int8x16
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Uint8x16) PermuteMasked(indices Uint8x16, z Mask8x16) Uint8x16
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x32) PermuteMasked(indices Uint8x32, z Mask8x32) Int8x32
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Uint8x32) PermuteMasked(indices Uint8x32, z Mask8x32) Uint8x32
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x64) PermuteMasked(indices Uint8x64, z Mask8x64) Int8x64
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Uint8x64) PermuteMasked(indices Uint8x64, z Mask8x64) Uint8x64
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Int16x8) PermuteMasked(indices Uint16x8, z Mask16x8) Int16x8
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Uint16x8) PermuteMasked(indices Uint16x8, z Mask16x8) Uint16x8
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Int16x16) PermuteMasked(indices Uint16x16, z Mask16x16) Int16x16
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Uint16x16) PermuteMasked(indices Uint16x16, z Mask16x16) Uint16x16
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Int16x32) PermuteMasked(indices Uint16x32, z Mask16x32) Int16x32
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMW, CPU Feature: AVX512BW
+func (x Uint16x32) PermuteMasked(indices Uint16x32, z Mask16x32) Uint16x32
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMPS, CPU Feature: AVX512F
+func (x Float32x8) PermuteMasked(indices Uint32x8, z Mask32x8) Float32x8
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMD, CPU Feature: AVX512F
+func (x Int32x8) PermuteMasked(indices Uint32x8, z Mask32x8) Int32x8
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMD, CPU Feature: AVX512F
+func (x Uint32x8) PermuteMasked(indices Uint32x8, z Mask32x8) Uint32x8
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMPS, CPU Feature: AVX512F
+func (x Float32x16) PermuteMasked(indices Uint32x16, z Mask32x16) Float32x16
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMD, CPU Feature: AVX512F
+func (x Int32x16) PermuteMasked(indices Uint32x16, z Mask32x16) Int32x16
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMD, CPU Feature: AVX512F
+func (x Uint32x16) PermuteMasked(indices Uint32x16, z Mask32x16) Uint32x16
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMPD, CPU Feature: AVX512F
+func (x Float64x4) PermuteMasked(indices Uint64x4, z Mask64x4) Float64x4
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMQ, CPU Feature: AVX512F
+func (x Int64x4) PermuteMasked(indices Uint64x4, z Mask64x4) Int64x4
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMQ, CPU Feature: AVX512F
+func (x Uint64x4) PermuteMasked(indices Uint64x4, z Mask64x4) Uint64x4
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMPD, CPU Feature: AVX512F
+func (x Float64x8) PermuteMasked(indices Uint64x8, z Mask64x8) Float64x8
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMQ, CPU Feature: AVX512F
+func (x Int64x8) PermuteMasked(indices Uint64x8, z Mask64x8) Int64x8
+
+// PermuteMasked performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// Only the needed bits to represent x's index are used in indices' elements.
+//
+// Asm: VPERMQ, CPU Feature: AVX512F
+func (x Uint64x8) PermuteMasked(indices Uint64x8, z Mask64x8) Uint64x8
+
  /* PopCount */
  
  // PopCount counts the number of set bits in each element.
diff --git a/src/simd/simd_test.go b/src/simd/simd_test.go

index 36923319ff312c9f6a0144c21c4a4fae9aa3ad42..f1a2f11738c2e543221ef6f5119d605f61754557 100644 (file)
--- a/src/simd/simd_test.go
+++ b/src/simd/simd_test.go
@@ -151,6 +151,41 @@ func TestMaskedAdd(t *testing.T) {
         testInt32x4BinaryMasked(t, []int32{1, 2, 3, 4}, []int32{5, 6, 7, 8}, []int32{-1, -1, 0, 0}, []int32{6, 8, 0, 0}, "AddMasked")
  }
  
+func TestPermute(t *testing.T) {
+       if !simd.HasAVX512() {
+               t.Skip("Test requires HasAVX512, not available on this hardware")
+               return
+       }
+       x := []int64{1, 2, 3, 4, 5, 6, 7, 8}
+       indices := []uint64{7, 6, 5, 4, 3, 2, 1, 0} // descending indices: expected result is x reversed
+       want := []int64{8, 7, 6, 5, 4, 3, 2, 1}
+       got := make([]int64, 8)
+       simd.LoadInt64x8Slice(x).Permute(simd.LoadUint64x8Slice(indices)).StoreSlice(got)
+       for i := range 8 {
+               if want[i] != got[i] {
+                       t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
+               }
+       }
+}
+
+func TestPermute2(t *testing.T) {
+       if !simd.HasAVX512() {
+               t.Skip("Test requires HasAVX512, not available on this hardware")
+               return
+       }
+       x := []int64{1, 2, 3, 4, 5, 6, 7, 8}
+       y := []int64{-1, -2, -3, -4, -5, -6, -7, -8} // negated so the source vector of each result lane is visible
+       indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0} // i + 8 is expected to pick y[i]; plain i picks x[i]
+       want := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
+       got := make([]int64, 8)
+       simd.LoadInt64x8Slice(x).Permute2(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got)
+       for i := range 8 {
+               if want[i] != got[i] {
+                       t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
+               }
+       }
+}
+
  // checkInt8Slices ensures that b and a are equal, to the end of b.
  // also serves to use the slices, to prevent accidental optimization.
  func checkInt8Slices(t *testing.T, a, b []int8) {
diff --git a/src/simd/simd_wrapped_test.go b/src/simd/simd_wrapped_test.go

index 6466684068e9647f4c3315d8676e34c2d8da0472..29452bdad0e9b4cbc53f1380d73f69b2b7015efa 100644 (file)
--- a/src/simd/simd_wrapped_test.go
+++ b/src/simd/simd_wrapped_test.go
@@ -7800,6 +7800,10 @@ func testUint64x8UnaryMasked(t *testing.T, v0 []uint64, v1 []int64, want []uint6
  // GaloisFieldAffineTransformMasked
  // Get128
  // GetElem
+// Permute
+// Permute2
+// Permute2Masked
+// PermuteMasked
  // RotateAllLeft
  // RotateAllLeftMasked
  // RotateAllRight
author	Junyang Shao <shaojunyang@google.com>
	Mon, 14 Jul 2025 19:39:44 +0000 (19:39 +0000)
committer	Junyang Shao <shaojunyang@google.com>
	Tue, 15 Jul 2025 21:53:57 +0000 (14:53 -0700)
src/cmd/compile/internal/amd64/simdssa.go		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdgenericOps.go		patch \| blob \| history
src/cmd/compile/internal/ssa/opGen.go		patch \| blob \| history
src/cmd/compile/internal/ssa/rewriteAMD64.go		patch \| blob \| history
src/cmd/compile/internal/ssagen/intrinsics.go		patch \| blob \| history
src/cmd/compile/internal/ssagen/simdintrinsics.go		patch \| blob \| history
src/simd/ops_amd64.go		patch \| blob \| history
src/simd/simd_test.go		patch \| blob \| history
src/simd/simd_wrapped_test.go		patch \| blob \| history