[dev.simd] cmd/compile, simd: complete AVX2? u?int shuffles

author Junyang Shao <shaojunyang@google.com>
Thu, 21 Aug 2025 20:37:57 +0000 (20:37 +0000)
committer Junyang Shao <shaojunyang@google.com>
Fri, 22 Aug 2025 16:10:28 +0000 (09:10 -0700)
diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go

index 5930ec99657d120a99a8cd178244feef4e1283ff..8698387235c17d4ef9c35e1385675f6107d257ee 100644 (file)
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -346,6 +346,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPERMQ256,
                 ssa.OpAMD64VPERMPD512,
                 ssa.OpAMD64VPERMQ512,
+               ssa.OpAMD64VPSHUFB256,
+               ssa.OpAMD64VPSHUFB512,
                 ssa.OpAMD64VPROLVD128,
                 ssa.OpAMD64VPROLVD256,
                 ssa.OpAMD64VPROLVD512,
@@ -606,6 +608,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPORQMasked128,
                 ssa.OpAMD64VPORQMasked256,
                 ssa.OpAMD64VPORQMasked512,
+               ssa.OpAMD64VPSHUFBMasked256,
+               ssa.OpAMD64VPSHUFBMasked512,
                 ssa.OpAMD64VPSHUFBMasked128,
                 ssa.OpAMD64VPERMBMasked256,
                 ssa.OpAMD64VPERMBMasked512,
@@ -903,6 +907,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VEXTRACTF64X4256,
                 ssa.OpAMD64VEXTRACTI128128,
                 ssa.OpAMD64VEXTRACTI64X4256,
+               ssa.OpAMD64VPSHUFD128,
+               ssa.OpAMD64VPSHUFD256,
+               ssa.OpAMD64VPSHUFD512,
+               ssa.OpAMD64VPSHUFHW128,
+               ssa.OpAMD64VPSHUFHW256,
+               ssa.OpAMD64VPSHUFHW512,
                 ssa.OpAMD64VPROLD128,
                 ssa.OpAMD64VPROLD256,
                 ssa.OpAMD64VPROLD512,
@@ -956,6 +966,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VREDUCEPDMasked128,
                 ssa.OpAMD64VREDUCEPDMasked256,
                 ssa.OpAMD64VREDUCEPDMasked512,
+               ssa.OpAMD64VPSHUFDMasked256,
+               ssa.OpAMD64VPSHUFDMasked512,
+               ssa.OpAMD64VPSHUFHWMasked256,
+               ssa.OpAMD64VPSHUFHWMasked512,
+               ssa.OpAMD64VPSHUFHWMasked128,
+               ssa.OpAMD64VPSHUFDMasked128,
                 ssa.OpAMD64VPROLDMasked128,
                 ssa.OpAMD64VPROLDMasked256,
                 ssa.OpAMD64VPROLDMasked512,
@@ -1682,6 +1698,14 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPERMI2QMasked256,
                 ssa.OpAMD64VPERMI2PDMasked512,
                 ssa.OpAMD64VPERMI2QMasked512,
+               ssa.OpAMD64VPSHUFDMasked256,
+               ssa.OpAMD64VPSHUFDMasked512,
+               ssa.OpAMD64VPSHUFHWMasked256,
+               ssa.OpAMD64VPSHUFHWMasked512,
+               ssa.OpAMD64VPSHUFHWMasked128,
+               ssa.OpAMD64VPSHUFDMasked128,
+               ssa.OpAMD64VPSHUFBMasked256,
+               ssa.OpAMD64VPSHUFBMasked512,
                 ssa.OpAMD64VPSHUFBMasked128,
                 ssa.OpAMD64VPERMBMasked256,
                 ssa.OpAMD64VPERMBMasked512,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules

index f1337d70be6ea08f0af28a346caa982276ea7b7d..5757278f6221a12f1a2cb15d2f166bedaf4c27b8 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -782,6 +782,32 @@
  (Permute2Uint64x2 ...) => (VPERMI2Q128 ...)
  (Permute2Uint64x4 ...) => (VPERMI2Q256 ...)
  (Permute2Uint64x8 ...) => (VPERMI2Q512 ...)
+(PermuteConstantInt32x4 ...) => (VPSHUFD128 ...)
+(PermuteConstantUint32x4 ...) => (VPSHUFD128 ...)
+(PermuteConstantGroupedInt32x8 ...) => (VPSHUFD256 ...)
+(PermuteConstantGroupedInt32x16 ...) => (VPSHUFD512 ...)
+(PermuteConstantGroupedUint32x8 ...) => (VPSHUFD256 ...)
+(PermuteConstantGroupedUint32x16 ...) => (VPSHUFD512 ...)
+(PermuteConstantHiInt16x8 ...) => (VPSHUFHW128 ...)
+(PermuteConstantHiInt32x4 ...) => (VPSHUFHW128 ...)
+(PermuteConstantHiUint16x8 ...) => (VPSHUFHW128 ...)
+(PermuteConstantHiUint32x4 ...) => (VPSHUFHW128 ...)
+(PermuteConstantHiGroupedInt16x16 ...) => (VPSHUFHW256 ...)
+(PermuteConstantHiGroupedInt16x32 ...) => (VPSHUFHW512 ...)
+(PermuteConstantHiGroupedUint16x16 ...) => (VPSHUFHW256 ...)
+(PermuteConstantHiGroupedUint16x32 ...) => (VPSHUFHW512 ...)
+(PermuteConstantLoInt16x8 ...) => (VPSHUFHW128 ...)
+(PermuteConstantLoInt32x4 ...) => (VPSHUFHW128 ...)
+(PermuteConstantLoUint16x8 ...) => (VPSHUFHW128 ...)
+(PermuteConstantLoUint32x4 ...) => (VPSHUFHW128 ...)
+(PermuteConstantLoGroupedInt16x16 ...) => (VPSHUFHW256 ...)
+(PermuteConstantLoGroupedInt16x32 ...) => (VPSHUFHW512 ...)
+(PermuteConstantLoGroupedUint16x16 ...) => (VPSHUFHW256 ...)
+(PermuteConstantLoGroupedUint16x32 ...) => (VPSHUFHW512 ...)
+(PermuteGroupedInt8x32 ...) => (VPSHUFB256 ...)
+(PermuteGroupedInt8x64 ...) => (VPSHUFB512 ...)
+(PermuteGroupedUint8x32 ...) => (VPSHUFB256 ...)
+(PermuteGroupedUint8x64 ...) => (VPSHUFB512 ...)
  (ReciprocalFloat32x4 ...) => (VRCPPS128 ...)
  (ReciprocalFloat32x8 ...) => (VRCPPS256 ...)
  (ReciprocalFloat32x16 ...) => (VRCP14PS512 ...)
@@ -1317,6 +1343,9 @@
  (VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask)
  (VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask)
  (VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask)
+(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask)
+(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask)
+(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask)
  (VMOVDQU8Masked512 (VPERMB512 x y) mask) => (VPERMBMasked512 x y mask)
  (VMOVDQU16Masked512 (VPERMW512 x y) mask) => (VPERMWMasked512 x y mask)
  (VMOVDQU32Masked512 (VPERMPS512 x y) mask) => (VPERMPSMasked512 x y mask)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go

index 96bb3ac032dd6ba3efeb21a83fe8c263d4c7a8ae..d473e2c2a90474c494d2bda716c144b6da959d1e 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@@ -816,7 +816,11 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPSHRDVWMasked256", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec256", resultInArg0: true},
                 {name: "VPSHRDVWMasked512", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec512", resultInArg0: true},
                 {name: "VPSHUFB128", argLength: 2, reg: v21, asm: "VPSHUFB", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPSHUFB256", argLength: 2, reg: v21, asm: "VPSHUFB", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPSHUFB512", argLength: 2, reg: w21, asm: "VPSHUFB", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VPSHUFBMasked128", argLength: 3, reg: w2kw, asm: "VPSHUFB", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPSHUFBMasked256", argLength: 3, reg: w2kw, asm: "VPSHUFB", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPSHUFBMasked512", argLength: 3, reg: w2kw, asm: "VPSHUFB", commutative: false, typ: "Vec512", resultInArg0: false},
                 {name: "VPSIGNB128", argLength: 2, reg: v21, asm: "VPSIGNB", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPSIGNB256", argLength: 2, reg: v21, asm: "VPSIGNB", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPSIGND128", argLength: 2, reg: v21, asm: "VPSIGND", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -1141,6 +1145,18 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                 {name: "VPCMPW512", argLength: 2, reg: w2k, asm: "VPCMPW", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
                 {name: "VPCMPD512", argLength: 2, reg: w2k, asm: "VPCMPD", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
                 {name: "VPCMPQ512", argLength: 2, reg: w2k, asm: "VPCMPQ", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
+               {name: "VPSHUFD128", argLength: 1, reg: v11, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPSHUFD256", argLength: 1, reg: v11, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPSHUFD512", argLength: 1, reg: w11, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPSHUFDMasked256", argLength: 2, reg: wkw, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPSHUFDMasked512", argLength: 2, reg: wkw, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPSHUFHW128", argLength: 1, reg: w11, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPSHUFHW256", argLength: 1, reg: v11, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPSHUFHW512", argLength: 1, reg: w11, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPSHUFHWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPSHUFHWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPSHUFHWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPSHUFDMasked128", argLength: 2, reg: wkw, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPROLD128", argLength: 1, reg: w11, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
                 {name: "VPROLD256", argLength: 1, reg: w11, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
                 {name: "VPROLD512", argLength: 1, reg: w11, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go

index 498c693e3cb1e1e62014e360eaa48d888ee39c5f..774fb5cce7e13ed86d36b7b578a7f2aa20f0e8f7 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -726,6 +726,10 @@ func simdGenericOps() []opData {
                 {name: "PermuteFloat32x16", argLength: 2, commutative: false},
                 {name: "PermuteFloat64x4", argLength: 2, commutative: false},
                 {name: "PermuteFloat64x8", argLength: 2, commutative: false},
+               {name: "PermuteGroupedInt8x32", argLength: 2, commutative: false},
+               {name: "PermuteGroupedInt8x64", argLength: 2, commutative: false},
+               {name: "PermuteGroupedUint8x32", argLength: 2, commutative: false},
+               {name: "PermuteGroupedUint8x64", argLength: 2, commutative: false},
                 {name: "PermuteInt8x16", argLength: 2, commutative: false},
                 {name: "PermuteInt8x32", argLength: 2, commutative: false},
                 {name: "PermuteInt8x64", argLength: 2, commutative: false},
@@ -1089,6 +1093,28 @@ func simdGenericOps() []opData {
                 {name: "GetElemUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
                 {name: "GetElemUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
                 {name: "GetElemUint64x2", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantHiInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantHiUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantLoInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantLoUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
+               {name: "PermuteConstantUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
                 {name: "RotateAllLeftInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
                 {name: "RotateAllLeftInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
                 {name: "RotateAllLeftInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go

index 9212b17a35f44afe57fa320cdf5fd8598a68162c..cb496a424454195c0f02979d068e1c2cfeefdd89 100644 (file)
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -2039,7 +2039,11 @@ const (
         OpAMD64VPSHRDVWMasked256
         OpAMD64VPSHRDVWMasked512
         OpAMD64VPSHUFB128
+       OpAMD64VPSHUFB256
+       OpAMD64VPSHUFB512
         OpAMD64VPSHUFBMasked128
+       OpAMD64VPSHUFBMasked256
+       OpAMD64VPSHUFBMasked512
         OpAMD64VPSIGNB128
         OpAMD64VPSIGNB256
         OpAMD64VPSIGND128
@@ -2364,6 +2368,18 @@ const (
         OpAMD64VPCMPW512
         OpAMD64VPCMPD512
         OpAMD64VPCMPQ512
+       OpAMD64VPSHUFD128
+       OpAMD64VPSHUFD256
+       OpAMD64VPSHUFD512
+       OpAMD64VPSHUFDMasked256
+       OpAMD64VPSHUFDMasked512
+       OpAMD64VPSHUFHW128
+       OpAMD64VPSHUFHW256
+       OpAMD64VPSHUFHW512
+       OpAMD64VPSHUFHWMasked256
+       OpAMD64VPSHUFHWMasked512
+       OpAMD64VPSHUFHWMasked128
+       OpAMD64VPSHUFDMasked128
         OpAMD64VPROLD128
         OpAMD64VPROLD256
         OpAMD64VPROLD512
@@ -5505,6 +5521,10 @@ const (
         OpPermuteFloat32x16
         OpPermuteFloat64x4
         OpPermuteFloat64x8
+       OpPermuteGroupedInt8x32
+       OpPermuteGroupedInt8x64
+       OpPermuteGroupedUint8x32
+       OpPermuteGroupedUint8x64
         OpPermuteInt8x16
         OpPermuteInt8x32
         OpPermuteInt8x64
@@ -5868,6 +5888,28 @@ const (
         OpGetElemUint16x8
         OpGetElemUint32x4
         OpGetElemUint64x2
+       OpPermuteConstantGroupedInt32x8
+       OpPermuteConstantGroupedInt32x16
+       OpPermuteConstantGroupedUint32x8
+       OpPermuteConstantGroupedUint32x16
+       OpPermuteConstantHiGroupedInt16x16
+       OpPermuteConstantHiGroupedInt16x32
+       OpPermuteConstantHiGroupedUint16x16
+       OpPermuteConstantHiGroupedUint16x32
+       OpPermuteConstantHiInt16x8
+       OpPermuteConstantHiInt32x4
+       OpPermuteConstantHiUint16x8
+       OpPermuteConstantHiUint32x4
+       OpPermuteConstantInt32x4
+       OpPermuteConstantLoGroupedInt16x16
+       OpPermuteConstantLoGroupedInt16x32
+       OpPermuteConstantLoGroupedUint16x16
+       OpPermuteConstantLoGroupedUint16x32
+       OpPermuteConstantLoInt16x8
+       OpPermuteConstantLoInt32x4
+       OpPermuteConstantLoUint16x8
+       OpPermuteConstantLoUint32x4
+       OpPermuteConstantUint32x4
         OpRotateAllLeftInt32x4
         OpRotateAllLeftInt32x8
         OpRotateAllLeftInt32x16
@@ -31031,6 +31073,34 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:   "VPSHUFB256",
+               argLen: 2,
+               asm:    x86.AVPSHUFB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPSHUFB512",
+               argLen: 2,
+               asm:    x86.AVPSHUFB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
         {
                 name:   "VPSHUFBMasked128",
                 argLen: 3,
@@ -31046,6 +31116,36 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:   "VPSHUFBMasked256",
+               argLen: 3,
+               asm:    x86.AVPSHUFB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPSHUFBMasked512",
+               argLen: 3,
+               asm:    x86.AVPSHUFB,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:   "VPSIGNB128",
                 argLen: 2,
@@ -35810,6 +35910,180 @@ var opcodeTable = [...]opInfo{
                         },
                 },
         },
+       {
+               name:    "VPSHUFD128",
+               auxType: auxUInt8,
+               argLen:  1,
+               asm:     x86.AVPSHUFD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:    "VPSHUFD256",
+               auxType: auxUInt8,
+               argLen:  1,
+               asm:     x86.AVPSHUFD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:    "VPSHUFD512",
+               auxType: auxUInt8,
+               argLen:  1,
+               asm:     x86.AVPSHUFD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:    "VPSHUFDMasked256",
+               auxType: auxUInt8,
+               argLen:  2,
+               asm:     x86.AVPSHUFD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:    "VPSHUFDMasked512",
+               auxType: auxUInt8,
+               argLen:  2,
+               asm:     x86.AVPSHUFD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:    "VPSHUFHW128",
+               auxType: auxUInt8,
+               argLen:  1,
+               asm:     x86.AVPSHUFHW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:    "VPSHUFHW256",
+               auxType: auxUInt8,
+               argLen:  1,
+               asm:     x86.AVPSHUFHW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:    "VPSHUFHW512",
+               auxType: auxUInt8,
+               argLen:  1,
+               asm:     x86.AVPSHUFHW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:    "VPSHUFHWMasked256",
+               auxType: auxUInt8,
+               argLen:  2,
+               asm:     x86.AVPSHUFHW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:    "VPSHUFHWMasked512",
+               auxType: auxUInt8,
+               argLen:  2,
+               asm:     x86.AVPSHUFHW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:    "VPSHUFHWMasked128",
+               auxType: auxUInt8,
+               argLen:  2,
+               asm:     x86.AVPSHUFHW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:    "VPSHUFDMasked128",
+               auxType: auxUInt8,
+               argLen:  2,
+               asm:     x86.AVPSHUFD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
         {
                 name:    "VPROLD128",
                 auxType: auxUInt8,
@@ -69053,6 +69327,26 @@ var opcodeTable = [...]opInfo{
                 argLen:  2,
                 generic: true,
         },
+       {
+               name:    "PermuteGroupedInt8x32",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteGroupedInt8x64",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteGroupedUint8x32",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "PermuteGroupedUint8x64",
+               argLen:  2,
+               generic: true,
+       },
         {
                 name:    "PermuteInt8x16",
                 argLen:  2,
@@ -70932,6 +71226,138 @@ var opcodeTable = [...]opInfo{
                 argLen:  1,
                 generic: true,
         },
+       {
+               name:    "PermuteConstantGroupedInt32x8",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantGroupedInt32x16",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantGroupedUint32x8",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantGroupedUint32x16",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantHiGroupedInt16x16",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantHiGroupedInt16x32",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantHiGroupedUint16x16",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantHiGroupedUint16x32",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantHiInt16x8",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantHiInt32x4",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantHiUint16x8",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantHiUint32x4",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantInt32x4",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantLoGroupedInt16x16",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantLoGroupedInt16x32",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantLoGroupedUint16x16",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantLoGroupedUint16x32",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantLoInt16x8",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantLoInt32x4",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantLoUint16x8",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantLoUint32x4",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "PermuteConstantUint32x4",
+               auxType: auxUInt8,
+               argLen:  1,
+               generic: true,
+       },
         {
                 name:    "RotateAllLeftInt32x4",
                 auxType: auxUInt8,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go

index e31b5f981f2a58ac63cc1b84f12a8251bebe809a..77ae32519a45072f3e113ae68bec3e38271f653c 100644 (file)
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -3223,6 +3223,72 @@ func rewriteValueAMD64(v *Value) bool {
         case OpPermute2Uint8x64:
                 v.Op = OpAMD64VPERMI2B512
                 return true
+       case OpPermuteConstantGroupedInt32x16:
+               v.Op = OpAMD64VPSHUFD512
+               return true
+       case OpPermuteConstantGroupedInt32x8:
+               v.Op = OpAMD64VPSHUFD256
+               return true
+       case OpPermuteConstantGroupedUint32x16:
+               v.Op = OpAMD64VPSHUFD512
+               return true
+       case OpPermuteConstantGroupedUint32x8:
+               v.Op = OpAMD64VPSHUFD256
+               return true
+       case OpPermuteConstantHiGroupedInt16x16:
+               v.Op = OpAMD64VPSHUFHW256
+               return true
+       case OpPermuteConstantHiGroupedInt16x32:
+               v.Op = OpAMD64VPSHUFHW512
+               return true
+       case OpPermuteConstantHiGroupedUint16x16:
+               v.Op = OpAMD64VPSHUFHW256
+               return true
+       case OpPermuteConstantHiGroupedUint16x32:
+               v.Op = OpAMD64VPSHUFHW512
+               return true
+       case OpPermuteConstantHiInt16x8:
+               v.Op = OpAMD64VPSHUFHW128
+               return true
+       case OpPermuteConstantHiInt32x4:
+               v.Op = OpAMD64VPSHUFHW128
+               return true
+       case OpPermuteConstantHiUint16x8:
+               v.Op = OpAMD64VPSHUFHW128
+               return true
+       case OpPermuteConstantHiUint32x4:
+               v.Op = OpAMD64VPSHUFHW128
+               return true
+       case OpPermuteConstantInt32x4:
+               v.Op = OpAMD64VPSHUFD128
+               return true
+       case OpPermuteConstantLoGroupedInt16x16:
+               v.Op = OpAMD64VPSHUFHW256
+               return true
+       case OpPermuteConstantLoGroupedInt16x32:
+               v.Op = OpAMD64VPSHUFHW512
+               return true
+       case OpPermuteConstantLoGroupedUint16x16:
+               v.Op = OpAMD64VPSHUFHW256
+               return true
+       case OpPermuteConstantLoGroupedUint16x32:
+               v.Op = OpAMD64VPSHUFHW512
+               return true
+       case OpPermuteConstantLoInt16x8:
+               v.Op = OpAMD64VPSHUFHW128
+               return true
+       case OpPermuteConstantLoInt32x4:
+               v.Op = OpAMD64VPSHUFHW128
+               return true
+       case OpPermuteConstantLoUint16x8:
+               v.Op = OpAMD64VPSHUFHW128
+               return true
+       case OpPermuteConstantLoUint32x4:
+               v.Op = OpAMD64VPSHUFHW128
+               return true
+       case OpPermuteConstantUint32x4:
+               v.Op = OpAMD64VPSHUFD128
+               return true
         case OpPermuteFloat32x16:
                 v.Op = OpAMD64VPERMPS512
                 return true
@@ -3235,6 +3301,18 @@ func rewriteValueAMD64(v *Value) bool {
         case OpPermuteFloat64x8:
                 v.Op = OpAMD64VPERMPD512
                 return true
+       case OpPermuteGroupedInt8x32:
+               v.Op = OpAMD64VPSHUFB256
+               return true
+       case OpPermuteGroupedInt8x64:
+               v.Op = OpAMD64VPSHUFB512
+               return true
+       case OpPermuteGroupedUint8x32:
+               v.Op = OpAMD64VPSHUFB256
+               return true
+       case OpPermuteGroupedUint8x64:
+               v.Op = OpAMD64VPSHUFB512
+               return true
         case OpPermuteInt16x16:
                 v.Op = OpAMD64VPERMW256
                 return true
@@ -26618,6 +26696,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked512(v *Value) bool {
                 v.AddArg4(x, y, z, mask)
                 return true
         }
+       // match: (VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask)
+       // result: (VPSHUFHWMasked512 [a] x mask)
+       for {
+               if v_0.Op != OpAMD64VPSHUFHW512 {
+                       break
+               }
+               a := auxIntToUint8(v_0.AuxInt)
+               x := v_0.Args[0]
+               mask := v_1
+               v.reset(OpAMD64VPSHUFHWMasked512)
+               v.AuxInt = uint8ToAuxInt(a)
+               v.AddArg2(x, mask)
+               return true
+       }
         // match: (VMOVDQU16Masked512 (VPERMW512 x y) mask)
         // result: (VPERMWMasked512 x y mask)
         for {
@@ -27311,6 +27403,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool {
                 v.AddArg4(x, y, z, mask)
                 return true
         }
+       // match: (VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask)
+       // result: (VPSHUFDMasked512 [a] x mask)
+       for {
+               if v_0.Op != OpAMD64VPSHUFD512 {
+                       break
+               }
+               a := auxIntToUint8(v_0.AuxInt)
+               x := v_0.Args[0]
+               mask := v_1
+               v.reset(OpAMD64VPSHUFDMasked512)
+               v.AuxInt = uint8ToAuxInt(a)
+               v.AddArg2(x, mask)
+               return true
+       }
         // match: (VMOVDQU32Masked512 (VPERMPS512 x y) mask)
         // result: (VPERMPSMasked512 x y mask)
         for {
@@ -28610,6 +28716,19 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v *Value) bool {
                 v.AddArg4(x, y, z, mask)
                 return true
         }
+       // match: (VMOVDQU8Masked512 (VPSHUFB512 x y) mask)
+       // result: (VPSHUFBMasked512 x y mask)
+       for {
+               if v_0.Op != OpAMD64VPSHUFB512 {
+                       break
+               }
+               y := v_0.Args[1]
+               x := v_0.Args[0]
+               mask := v_1
+               v.reset(OpAMD64VPSHUFBMasked512)
+               v.AddArg3(x, y, mask)
+               return true
+       }
         // match: (VMOVDQU8Masked512 (VPERMB512 x y) mask)
         // result: (VPERMBMasked512 x y mask)
         for {
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go

index 1c2b22a7fea80fe09a57944c4a5d35517c692d47..4ce329e1a4a3e30f8ba33012bc3901c8138fe2f1 100644 (file)
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -794,6 +794,32 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
         addF(simdPackage, "Float64x8.Permute2", opLen3_231(ssa.OpPermute2Float64x8, types.TypeVec512), sys.AMD64)
         addF(simdPackage, "Int64x8.Permute2", opLen3_231(ssa.OpPermute2Int64x8, types.TypeVec512), sys.AMD64)
         addF(simdPackage, "Uint64x8.Permute2", opLen3_231(ssa.OpPermute2Uint64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantInt32x4, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Uint32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantUint32x4, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Int32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x8, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Int32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x16, types.TypeVec512, 0), sys.AMD64)
+       addF(simdPackage, "Uint32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x8, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Uint32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
+       addF(simdPackage, "Int16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt16x8, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Int32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt32x4, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Uint16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint16x8, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Uint32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint32x4, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Int16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Int16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
+       addF(simdPackage, "Uint16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Uint16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
+       addF(simdPackage, "Int16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt16x8, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Int32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt32x4, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Uint16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint16x8, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Uint32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint32x4, types.TypeVec128, 0), sys.AMD64)
+       addF(simdPackage, "Int16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Int16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
+       addF(simdPackage, "Uint16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
+       addF(simdPackage, "Uint16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
+       addF(simdPackage, "Int8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x64, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x32, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x64, types.TypeVec512), sys.AMD64)
         addF(simdPackage, "Float32x4.Reciprocal", opLen1(ssa.OpReciprocalFloat32x4, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Float32x8.Reciprocal", opLen1(ssa.OpReciprocalFloat32x8, types.TypeVec256), sys.AMD64)
         addF(simdPackage, "Float32x16.Reciprocal", opLen1(ssa.OpReciprocalFloat32x16, types.TypeVec512), sys.AMD64)
diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml

index a576829e8f4cbf38c1a408115f465ba53e7a3506..556562b51abeaf2df1424639bee9064839ae11de 100644 (file)
--- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml
+++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml
@@ -74,4 +74,32 @@
    commutative: false
    documentation: !string |-
      // NAME copies element zero of its (128-bit) input to all elements of
-    // the 512-bit output vector.
-\ No newline at end of file
+    // the 512-bit output vector.
+- go: PermuteGrouped
+  commutative: false
+  documentation: !string |- # Detailed documentation will rely on the specific ops.
+    // NAME performs a grouped permutation of vector x using indices:
+- go: PermuteConstant
+  commutative: false
+  documentation: !string |- # Detailed documentation will rely on the specific ops.
+    // NAME performs a permutation of vector x using constant indices:
+- go: PermuteConstantGrouped
+  commutative: false
+  documentation: !string |- # Detailed documentation will rely on the specific ops.
+    // NAME performs a grouped permutation of vector x using constant indices:
+- go: PermuteConstantLo
+  commutative: false
+  documentation: !string |- # Detailed documentation will rely on the specific ops.
+    // NAME performs a permutation of vector x using constant indices:
+- go: PermuteConstantLoGrouped
+  commutative: false
+  documentation: !string |- # Detailed documentation will rely on the specific ops.
+    // NAME performs a grouped permutation of vector x using constant indices:
+- go: PermuteConstantHi
+  commutative: false
+  documentation: !string |- # Detailed documentation will rely on the specific ops.
+    // NAME performs a permutation of vector x using constant indices:
+- go: PermuteConstantHiGrouped
+  commutative: false
+  documentation: !string |- # Detailed documentation will rely on the specific ops.
+    // NAME performs a grouped permutation of vector x using constant indices:
+\ No newline at end of file
diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml

index 3cdb9efe27b364367140329f2b1ecded82b57e6c..3d471ec480a37edf4f1df4dd8e932055e49a89ac 100644 (file)
--- a/src/simd/_gen/simdgen/ops/Moves/go.yaml
+++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml
@@ -432,4 +432,98 @@
      go: $t
      name: indices
    out:
-  - *128any
-\ No newline at end of file
+  - *128any
+- go: PermuteGrouped
+  asm: VPSHUFB
+  addDoc: !string |-
+    // result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+    // Only the bits needed to represent an index within a group of x are used from each element of indices.
+    // However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
+    // Each group is of size 128-bit.
+  in:
+  - &256Or512any
+    bits: "256|512"
+    go: $t
+  - bits: "256|512"
+    go: $t
+    name: indices
+  out:
+  - *256Or512any
+
+- go: PermuteConstant
+  asm: VPSHUFD
+  addDoc: !string |-
+    // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+    // Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+  in:
+  - *128any
+  - class: immediate
+    immOffset: 0
+    name: indices
+  out:
+  - *128any
+- go: PermuteConstantGrouped
+  asm: VPSHUFD
+  addDoc: !string |-
+    // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+    // Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+    // Each group is of size 128-bit.
+  in:
+  - *256Or512any
+  - class: immediate
+    immOffset: 0
+    name: indices
+  out:
+  - *256Or512any
+
+- go: PermuteConstantLo
+  asm: VPSHUFHW
+  addDoc: !string |-
+    // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+    // Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+  in:
+    - *128any
+    - class: immediate
+      immOffset: 0
+      name: indices
+  out:
+    - *128any
+- go: PermuteConstantLoGrouped
+  asm: VPSHUFHW
+  addDoc: !string |-
+    // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+    // Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+    // Each group is of size 128-bit.
+  in:
+  - *256Or512any
+  - class: immediate
+    immOffset: 0
+    name: indices
+  out:
+  - *256Or512any
+
+- go: PermuteConstantHi
+  asm: VPSHUFHW
+  addDoc: !string |-
+    // result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+    // Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+  in:
+  - *128any
+  - class: immediate
+    immOffset: 0
+    name: indices
+  out:
+  - *128any
+- go: PermuteConstantHiGrouped
+  asm: VPSHUFHW
+  addDoc: !string |-
+    // result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
+    // Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+    // Each group is of size 128-bit.
+  in:
+  - *256Or512any
+  - class: immediate
+    immOffset: 0
+    name: indices
+  out:
+  - *256Or512any
+\ No newline at end of file
diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go

index e0e580bd27a63f37aae4d9e258ff815990342630..e600f7c1a0c9b92a2aac990a134bffb0005984cc 100644 (file)
--- a/src/simd/ops_amd64.go
+++ b/src/simd/ops_amd64.go
@@ -4564,6 +4564,266 @@ func (x Int64x8) Permute2(y Int64x8, indices Uint64x8) Int64x8
  // Asm: VPERMI2Q, CPU Feature: AVX512
  func (x Uint64x8) Permute2(y Uint64x8, indices Uint64x8) Uint64x8
  
+/* PermuteConstant */
+
+// PermuteConstant performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX
+func (x Int32x4) PermuteConstant(indices uint8) Int32x4
+
+// PermuteConstant performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX
+func (x Uint32x4) PermuteConstant(indices uint8) Uint32x4
+
+/* PermuteConstantGrouped */
+
+// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX2
+func (x Int32x8) PermuteConstantGrouped(indices uint8) Int32x8
+
+// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX512
+func (x Int32x16) PermuteConstantGrouped(indices uint8) Int32x16
+
+// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX2
+func (x Uint32x8) PermuteConstantGrouped(indices uint8) Uint32x8
+
+// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX512
+func (x Uint32x16) PermuteConstantGrouped(indices uint8) Uint32x16
+
+/* PermuteConstantHi */
+
+// PermuteConstantHi performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x8) PermuteConstantHi(indices uint8) Int16x8
+
+// PermuteConstantHi performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX
+func (x Int32x4) PermuteConstantHi(indices uint8) Int32x4
+
+// PermuteConstantHi performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x8) PermuteConstantHi(indices uint8) Uint16x8
+
+// PermuteConstantHi performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX
+func (x Uint32x4) PermuteConstantHi(indices uint8) Uint32x4
+
+/* PermuteConstantHiGrouped */
+
+// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Int16x16) PermuteConstantHiGrouped(indices uint8) Int16x16
+
+// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x32) PermuteConstantHiGrouped(indices uint8) Int16x32
+
+// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Uint16x16) PermuteConstantHiGrouped(indices uint8) Uint16x16
+
+// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x32) PermuteConstantHiGrouped(indices uint8) Uint16x32
+
+/* PermuteConstantLo */
+
+// PermuteConstantLo performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x8) PermuteConstantLo(indices uint8) Int16x8
+
+// PermuteConstantLo performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX
+func (x Int32x4) PermuteConstantLo(indices uint8) Int32x4
+
+// PermuteConstantLo performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x8) PermuteConstantLo(indices uint8) Uint16x8
+
+// PermuteConstantLo performs a permutation of vector x using constant indices:
+// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX
+func (x Uint32x4) PermuteConstantLo(indices uint8) Uint32x4
+
+/* PermuteConstantLoGrouped */
+
+// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Int16x16) PermuteConstantLoGrouped(indices uint8) Int16x16
+
+// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x32) PermuteConstantLoGrouped(indices uint8) Int16x32
+
+// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Uint16x16) PermuteConstantLoGrouped(indices uint8) Uint16x16
+
+// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
+// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x32) PermuteConstantLoGrouped(indices uint8) Uint16x32
+
+/* PermuteGrouped */
+
+// PermuteGrouped performs a grouped permutation of vector x using indices:
+// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// Only the bits needed to represent an index within a group of x are used from each element of indices.
+// However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
+// Each group is of size 128-bit.
+//
+// Asm: VPSHUFB, CPU Feature: AVX2
+func (x Int8x32) PermuteGrouped(indices Int8x32) Int8x32
+
+// PermuteGrouped performs a grouped permutation of vector x using indices:
+// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// Only the bits needed to represent an index within a group of x are used from each element of indices.
+// However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
+// Each group is of size 128-bit.
+//
+// Asm: VPSHUFB, CPU Feature: AVX512
+func (x Int8x64) PermuteGrouped(indices Int8x64) Int8x64
+
+// PermuteGrouped performs a grouped permutation of vector x using indices:
+// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// Only the bits needed to represent an index within a group of x are used from each element of indices.
+// However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
+// Each group is of size 128-bit.
+//
+// Asm: VPSHUFB, CPU Feature: AVX2
+func (x Uint8x32) PermuteGrouped(indices Uint8x32) Uint8x32
+
+// PermuteGrouped performs a grouped permutation of vector x using indices:
+// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// Only the bits needed to represent an index within a group of x are used from each element of indices.
+// However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
+// Each group is of size 128-bit.
+//
+// Asm: VPSHUFB, CPU Feature: AVX512
+func (x Uint8x64) PermuteGrouped(indices Uint8x64) Uint8x64
+
  /* Reciprocal */
  
  // Reciprocal computes an approximate reciprocal of each element.
author	Junyang Shao <shaojunyang@google.com>
	Thu, 21 Aug 2025 20:37:57 +0000 (20:37 +0000)
committer	Junyang Shao <shaojunyang@google.com>
	Fri, 22 Aug 2025 16:10:28 +0000 (09:10 -0700)
src/cmd/compile/internal/amd64/simdssa.go		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdgenericOps.go		patch \| blob \| history
src/cmd/compile/internal/ssa/opGen.go		patch \| blob \| history
src/cmd/compile/internal/ssa/rewriteAMD64.go		patch \| blob \| history
src/cmd/compile/internal/ssagen/simdintrinsics.go		patch \| blob \| history
src/simd/_gen/simdgen/ops/Moves/categories.yaml		patch \| blob \| history
src/simd/_gen/simdgen/ops/Moves/go.yaml		patch \| blob \| history
src/simd/ops_amd64.go		patch \| blob \| history