From baea0c700b70d90331be3370f89991d7428d92aa Mon Sep 17 00:00:00 2001 From: Junyang Shao Date: Thu, 21 Aug 2025 20:37:57 +0000 Subject: [PATCH] [dev.simd] cmd/compile, simd: complete AVX2? u?int shuffles The namings follow the following convention: - If its indices are from constant, amend "Constant" to the name. - If its indices are used by multiple groups, amend "Grouped" to the name. - If it is indexing only the low part, amend "Lo", similarly "Hi". Change-Id: I6a58f5dae54c882ebd59f39b5288f6f3f14d957f Reviewed-on: https://go-review.googlesource.com/c/go/+/698296 LUCI-TryBot-Result: Go LUCI Reviewed-by: David Chase --- src/cmd/compile/internal/amd64/simdssa.go | 24 + .../compile/internal/ssa/_gen/simdAMD64.rules | 29 ++ .../compile/internal/ssa/_gen/simdAMD64ops.go | 16 + .../internal/ssa/_gen/simdgenericOps.go | 26 ++ src/cmd/compile/internal/ssa/opGen.go | 426 ++++++++++++++++++ src/cmd/compile/internal/ssa/rewriteAMD64.go | 119 +++++ .../compile/internal/ssagen/simdintrinsics.go | 26 ++ .../_gen/simdgen/ops/Moves/categories.yaml | 30 +- src/simd/_gen/simdgen/ops/Moves/go.yaml | 96 +++- src/simd/ops_amd64.go | 260 +++++++++++ 10 files changed, 1050 insertions(+), 2 deletions(-) diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index 5930ec9965..8698387235 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -346,6 +346,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPERMQ256, ssa.OpAMD64VPERMPD512, ssa.OpAMD64VPERMQ512, + ssa.OpAMD64VPSHUFB256, + ssa.OpAMD64VPSHUFB512, ssa.OpAMD64VPROLVD128, ssa.OpAMD64VPROLVD256, ssa.OpAMD64VPROLVD512, @@ -606,6 +608,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPORQMasked128, ssa.OpAMD64VPORQMasked256, ssa.OpAMD64VPORQMasked512, + ssa.OpAMD64VPSHUFBMasked256, + ssa.OpAMD64VPSHUFBMasked512, ssa.OpAMD64VPSHUFBMasked128, ssa.OpAMD64VPERMBMasked256, 
ssa.OpAMD64VPERMBMasked512, @@ -903,6 +907,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VEXTRACTF64X4256, ssa.OpAMD64VEXTRACTI128128, ssa.OpAMD64VEXTRACTI64X4256, + ssa.OpAMD64VPSHUFD128, + ssa.OpAMD64VPSHUFD256, + ssa.OpAMD64VPSHUFD512, + ssa.OpAMD64VPSHUFHW128, + ssa.OpAMD64VPSHUFHW256, + ssa.OpAMD64VPSHUFHW512, ssa.OpAMD64VPROLD128, ssa.OpAMD64VPROLD256, ssa.OpAMD64VPROLD512, @@ -956,6 +966,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VREDUCEPDMasked128, ssa.OpAMD64VREDUCEPDMasked256, ssa.OpAMD64VREDUCEPDMasked512, + ssa.OpAMD64VPSHUFDMasked256, + ssa.OpAMD64VPSHUFDMasked512, + ssa.OpAMD64VPSHUFHWMasked256, + ssa.OpAMD64VPSHUFHWMasked512, + ssa.OpAMD64VPSHUFHWMasked128, + ssa.OpAMD64VPSHUFDMasked128, ssa.OpAMD64VPROLDMasked128, ssa.OpAMD64VPROLDMasked256, ssa.OpAMD64VPROLDMasked512, @@ -1682,6 +1698,14 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPERMI2QMasked256, ssa.OpAMD64VPERMI2PDMasked512, ssa.OpAMD64VPERMI2QMasked512, + ssa.OpAMD64VPSHUFDMasked256, + ssa.OpAMD64VPSHUFDMasked512, + ssa.OpAMD64VPSHUFHWMasked256, + ssa.OpAMD64VPSHUFHWMasked512, + ssa.OpAMD64VPSHUFHWMasked128, + ssa.OpAMD64VPSHUFDMasked128, + ssa.OpAMD64VPSHUFBMasked256, + ssa.OpAMD64VPSHUFBMasked512, ssa.OpAMD64VPSHUFBMasked128, ssa.OpAMD64VPERMBMasked256, ssa.OpAMD64VPERMBMasked512, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index f1337d70be..5757278f62 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -782,6 +782,32 @@ (Permute2Uint64x2 ...) => (VPERMI2Q128 ...) (Permute2Uint64x4 ...) => (VPERMI2Q256 ...) (Permute2Uint64x8 ...) => (VPERMI2Q512 ...) +(PermuteConstantInt32x4 ...) => (VPSHUFD128 ...) +(PermuteConstantUint32x4 ...) => (VPSHUFD128 ...) +(PermuteConstantGroupedInt32x8 ...) => (VPSHUFD256 ...) +(PermuteConstantGroupedInt32x16 ...) => (VPSHUFD512 ...) 
+(PermuteConstantGroupedUint32x8 ...) => (VPSHUFD256 ...) +(PermuteConstantGroupedUint32x16 ...) => (VPSHUFD512 ...) +(PermuteConstantHiInt16x8 ...) => (VPSHUFHW128 ...) +(PermuteConstantHiInt32x4 ...) => (VPSHUFHW128 ...) +(PermuteConstantHiUint16x8 ...) => (VPSHUFHW128 ...) +(PermuteConstantHiUint32x4 ...) => (VPSHUFHW128 ...) +(PermuteConstantHiGroupedInt16x16 ...) => (VPSHUFHW256 ...) +(PermuteConstantHiGroupedInt16x32 ...) => (VPSHUFHW512 ...) +(PermuteConstantHiGroupedUint16x16 ...) => (VPSHUFHW256 ...) +(PermuteConstantHiGroupedUint16x32 ...) => (VPSHUFHW512 ...) +(PermuteConstantLoInt16x8 ...) => (VPSHUFHW128 ...) +(PermuteConstantLoInt32x4 ...) => (VPSHUFHW128 ...) +(PermuteConstantLoUint16x8 ...) => (VPSHUFHW128 ...) +(PermuteConstantLoUint32x4 ...) => (VPSHUFHW128 ...) +(PermuteConstantLoGroupedInt16x16 ...) => (VPSHUFHW256 ...) +(PermuteConstantLoGroupedInt16x32 ...) => (VPSHUFHW512 ...) +(PermuteConstantLoGroupedUint16x16 ...) => (VPSHUFHW256 ...) +(PermuteConstantLoGroupedUint16x32 ...) => (VPSHUFHW512 ...) +(PermuteGroupedInt8x32 ...) => (VPSHUFB256 ...) +(PermuteGroupedInt8x64 ...) => (VPSHUFB512 ...) +(PermuteGroupedUint8x32 ...) => (VPSHUFB256 ...) +(PermuteGroupedUint8x64 ...) => (VPSHUFB512 ...) (ReciprocalFloat32x4 ...) => (VRCPPS128 ...) (ReciprocalFloat32x8 ...) => (VRCPPS256 ...) (ReciprocalFloat32x16 ...) => (VRCP14PS512 ...) 
@@ -1317,6 +1343,9 @@ (VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask) (VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask) (VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask) +(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask) +(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask) +(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask) (VMOVDQU8Masked512 (VPERMB512 x y) mask) => (VPERMBMasked512 x y mask) (VMOVDQU16Masked512 (VPERMW512 x y) mask) => (VPERMWMasked512 x y mask) (VMOVDQU32Masked512 (VPERMPS512 x y) mask) => (VPERMPSMasked512 x y mask) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index 96bb3ac032..d473e2c2a9 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -816,7 +816,11 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPSHRDVWMasked256", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPSHRDVWMasked512", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPSHUFB128", argLength: 2, reg: v21, asm: "VPSHUFB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPSHUFB256", argLength: 2, reg: v21, asm: "VPSHUFB", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPSHUFB512", argLength: 2, reg: w21, asm: "VPSHUFB", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSHUFBMasked128", argLength: 3, reg: w2kw, asm: "VPSHUFB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPSHUFBMasked256", argLength: 3, reg: w2kw, asm: "VPSHUFB", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPSHUFBMasked512", argLength: 3, reg: w2kw, 
asm: "VPSHUFB", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSIGNB128", argLength: 2, reg: v21, asm: "VPSIGNB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSIGNB256", argLength: 2, reg: v21, asm: "VPSIGNB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSIGND128", argLength: 2, reg: v21, asm: "VPSIGND", commutative: false, typ: "Vec128", resultInArg0: false}, @@ -1141,6 +1145,18 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPCMPW512", argLength: 2, reg: w2k, asm: "VPCMPW", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPD512", argLength: 2, reg: w2k, asm: "VPCMPD", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPQ512", argLength: 2, reg: w2k, asm: "VPCMPQ", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false}, + {name: "VPSHUFD128", argLength: 1, reg: v11, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPSHUFD256", argLength: 1, reg: v11, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPSHUFD512", argLength: 1, reg: w11, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPSHUFDMasked256", argLength: 2, reg: wkw, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPSHUFDMasked512", argLength: 2, reg: wkw, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPSHUFHW128", argLength: 1, reg: w11, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPSHUFHW256", argLength: 1, reg: v11, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPSHUFHW512", argLength: 1, reg: w11, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: 
false}, + {name: "VPSHUFHWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPSHUFHWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPSHUFHWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPSHUFDMasked128", argLength: 2, reg: wkw, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPROLD128", argLength: 1, reg: w11, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPROLD256", argLength: 1, reg: w11, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPROLD512", argLength: 1, reg: w11, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 498c693e3c..774fb5cce7 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -726,6 +726,10 @@ func simdGenericOps() []opData { {name: "PermuteFloat32x16", argLength: 2, commutative: false}, {name: "PermuteFloat64x4", argLength: 2, commutative: false}, {name: "PermuteFloat64x8", argLength: 2, commutative: false}, + {name: "PermuteGroupedInt8x32", argLength: 2, commutative: false}, + {name: "PermuteGroupedInt8x64", argLength: 2, commutative: false}, + {name: "PermuteGroupedUint8x32", argLength: 2, commutative: false}, + {name: "PermuteGroupedUint8x64", argLength: 2, commutative: false}, {name: "PermuteInt8x16", argLength: 2, commutative: false}, {name: "PermuteInt8x32", argLength: 2, commutative: false}, {name: "PermuteInt8x64", argLength: 2, commutative: false}, @@ -1089,6 +1093,28 @@ func simdGenericOps() []opData { {name: 
"GetElemUint16x8", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemUint32x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemUint64x2", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantHiInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantHiUint32x4", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantLoInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantLoUint16x8", argLength: 1, 
commutative: false, aux: "UInt8"}, + {name: "PermuteConstantLoUint32x4", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "PermuteConstantUint32x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RotateAllLeftInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RotateAllLeftInt32x8", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RotateAllLeftInt32x16", argLength: 1, commutative: false, aux: "UInt8"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 9212b17a35..cb496a4244 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -2039,7 +2039,11 @@ const ( OpAMD64VPSHRDVWMasked256 OpAMD64VPSHRDVWMasked512 OpAMD64VPSHUFB128 + OpAMD64VPSHUFB256 + OpAMD64VPSHUFB512 OpAMD64VPSHUFBMasked128 + OpAMD64VPSHUFBMasked256 + OpAMD64VPSHUFBMasked512 OpAMD64VPSIGNB128 OpAMD64VPSIGNB256 OpAMD64VPSIGND128 @@ -2364,6 +2368,18 @@ const ( OpAMD64VPCMPW512 OpAMD64VPCMPD512 OpAMD64VPCMPQ512 + OpAMD64VPSHUFD128 + OpAMD64VPSHUFD256 + OpAMD64VPSHUFD512 + OpAMD64VPSHUFDMasked256 + OpAMD64VPSHUFDMasked512 + OpAMD64VPSHUFHW128 + OpAMD64VPSHUFHW256 + OpAMD64VPSHUFHW512 + OpAMD64VPSHUFHWMasked256 + OpAMD64VPSHUFHWMasked512 + OpAMD64VPSHUFHWMasked128 + OpAMD64VPSHUFDMasked128 OpAMD64VPROLD128 OpAMD64VPROLD256 OpAMD64VPROLD512 @@ -5505,6 +5521,10 @@ const ( OpPermuteFloat32x16 OpPermuteFloat64x4 OpPermuteFloat64x8 + OpPermuteGroupedInt8x32 + OpPermuteGroupedInt8x64 + OpPermuteGroupedUint8x32 + OpPermuteGroupedUint8x64 OpPermuteInt8x16 OpPermuteInt8x32 OpPermuteInt8x64 @@ -5868,6 +5888,28 @@ const ( OpGetElemUint16x8 OpGetElemUint32x4 OpGetElemUint64x2 + OpPermuteConstantGroupedInt32x8 + OpPermuteConstantGroupedInt32x16 + OpPermuteConstantGroupedUint32x8 + OpPermuteConstantGroupedUint32x16 + OpPermuteConstantHiGroupedInt16x16 + OpPermuteConstantHiGroupedInt16x32 + OpPermuteConstantHiGroupedUint16x16 + OpPermuteConstantHiGroupedUint16x32 + OpPermuteConstantHiInt16x8 + 
OpPermuteConstantHiInt32x4 + OpPermuteConstantHiUint16x8 + OpPermuteConstantHiUint32x4 + OpPermuteConstantInt32x4 + OpPermuteConstantLoGroupedInt16x16 + OpPermuteConstantLoGroupedInt16x32 + OpPermuteConstantLoGroupedUint16x16 + OpPermuteConstantLoGroupedUint16x32 + OpPermuteConstantLoInt16x8 + OpPermuteConstantLoInt32x4 + OpPermuteConstantLoUint16x8 + OpPermuteConstantLoUint32x4 + OpPermuteConstantUint32x4 OpRotateAllLeftInt32x4 OpRotateAllLeftInt32x8 OpRotateAllLeftInt32x16 @@ -31031,6 +31073,34 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPSHUFB256", + argLen: 2, + asm: x86.AVPSHUFB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSHUFB512", + argLen: 2, + asm: x86.AVPSHUFB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPSHUFBMasked128", argLen: 3, @@ -31046,6 +31116,36 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPSHUFBMasked256", + argLen: 3, + asm: x86.AVPSHUFB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + 
name: "VPSHUFBMasked512", + argLen: 3, + asm: x86.AVPSHUFB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPSIGNB128", argLen: 2, @@ -35810,6 +35910,180 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPSHUFD128", + auxType: auxUInt8, + argLen: 1, + asm: x86.AVPSHUFD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSHUFD256", + auxType: auxUInt8, + argLen: 1, + asm: x86.AVPSHUFD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSHUFD512", + auxType: auxUInt8, + argLen: 1, + asm: x86.AVPSHUFD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPSHUFDMasked256", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPSHUFD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSHUFDMasked512", + auxType: auxUInt8, + 
argLen: 2, + asm: x86.AVPSHUFD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSHUFHW128", + auxType: auxUInt8, + argLen: 1, + asm: x86.AVPSHUFHW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPSHUFHW256", + auxType: auxUInt8, + argLen: 1, + asm: x86.AVPSHUFHW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSHUFHW512", + auxType: auxUInt8, + argLen: 1, + asm: x86.AVPSHUFHW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPSHUFHWMasked256", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPSHUFHW, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSHUFHWMasked512", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPSHUFHW, + reg: regInfo{ + 
inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSHUFHWMasked128", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPSHUFHW, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSHUFDMasked128", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPSHUFD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPROLD128", auxType: auxUInt8, @@ -69053,6 +69327,26 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "PermuteGroupedInt8x32", + argLen: 2, + generic: true, + }, + { + name: "PermuteGroupedInt8x64", + argLen: 2, + generic: true, + }, + { + name: "PermuteGroupedUint8x32", + argLen: 2, + generic: true, + }, + { + name: "PermuteGroupedUint8x64", + argLen: 2, + generic: true, + }, { name: "PermuteInt8x16", argLen: 2, @@ -70932,6 +71226,138 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "PermuteConstantGroupedInt32x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantGroupedInt32x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantGroupedUint32x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantGroupedUint32x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantHiGroupedInt16x16", + auxType: 
auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantHiGroupedInt16x32", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantHiGroupedUint16x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantHiGroupedUint16x32", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantHiInt16x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantHiInt32x4", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantHiUint16x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantHiUint32x4", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantInt32x4", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantLoGroupedInt16x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantLoGroupedInt16x32", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantLoGroupedUint16x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantLoGroupedUint16x32", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantLoInt16x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantLoInt32x4", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantLoUint16x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantLoUint32x4", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "PermuteConstantUint32x4", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, { name: "RotateAllLeftInt32x4", auxType: auxUInt8, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index e31b5f981f..77ae32519a 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ 
b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -3223,6 +3223,72 @@ func rewriteValueAMD64(v *Value) bool { case OpPermute2Uint8x64: v.Op = OpAMD64VPERMI2B512 return true + case OpPermuteConstantGroupedInt32x16: + v.Op = OpAMD64VPSHUFD512 + return true + case OpPermuteConstantGroupedInt32x8: + v.Op = OpAMD64VPSHUFD256 + return true + case OpPermuteConstantGroupedUint32x16: + v.Op = OpAMD64VPSHUFD512 + return true + case OpPermuteConstantGroupedUint32x8: + v.Op = OpAMD64VPSHUFD256 + return true + case OpPermuteConstantHiGroupedInt16x16: + v.Op = OpAMD64VPSHUFHW256 + return true + case OpPermuteConstantHiGroupedInt16x32: + v.Op = OpAMD64VPSHUFHW512 + return true + case OpPermuteConstantHiGroupedUint16x16: + v.Op = OpAMD64VPSHUFHW256 + return true + case OpPermuteConstantHiGroupedUint16x32: + v.Op = OpAMD64VPSHUFHW512 + return true + case OpPermuteConstantHiInt16x8: + v.Op = OpAMD64VPSHUFHW128 + return true + case OpPermuteConstantHiInt32x4: + v.Op = OpAMD64VPSHUFHW128 + return true + case OpPermuteConstantHiUint16x8: + v.Op = OpAMD64VPSHUFHW128 + return true + case OpPermuteConstantHiUint32x4: + v.Op = OpAMD64VPSHUFHW128 + return true + case OpPermuteConstantInt32x4: + v.Op = OpAMD64VPSHUFD128 + return true + case OpPermuteConstantLoGroupedInt16x16: + v.Op = OpAMD64VPSHUFHW256 + return true + case OpPermuteConstantLoGroupedInt16x32: + v.Op = OpAMD64VPSHUFHW512 + return true + case OpPermuteConstantLoGroupedUint16x16: + v.Op = OpAMD64VPSHUFHW256 + return true + case OpPermuteConstantLoGroupedUint16x32: + v.Op = OpAMD64VPSHUFHW512 + return true + case OpPermuteConstantLoInt16x8: + v.Op = OpAMD64VPSHUFHW128 + return true + case OpPermuteConstantLoInt32x4: + v.Op = OpAMD64VPSHUFHW128 + return true + case OpPermuteConstantLoUint16x8: + v.Op = OpAMD64VPSHUFHW128 + return true + case OpPermuteConstantLoUint32x4: + v.Op = OpAMD64VPSHUFHW128 + return true + case OpPermuteConstantUint32x4: + v.Op = OpAMD64VPSHUFD128 + return true case OpPermuteFloat32x16: v.Op = 
OpAMD64VPERMPS512 return true @@ -3235,6 +3301,18 @@ func rewriteValueAMD64(v *Value) bool { case OpPermuteFloat64x8: v.Op = OpAMD64VPERMPD512 return true + case OpPermuteGroupedInt8x32: + v.Op = OpAMD64VPSHUFB256 + return true + case OpPermuteGroupedInt8x64: + v.Op = OpAMD64VPSHUFB512 + return true + case OpPermuteGroupedUint8x32: + v.Op = OpAMD64VPSHUFB256 + return true + case OpPermuteGroupedUint8x64: + v.Op = OpAMD64VPSHUFB512 + return true case OpPermuteInt16x16: v.Op = OpAMD64VPERMW256 return true @@ -26618,6 +26696,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked512(v *Value) bool { v.AddArg4(x, y, z, mask) return true } + // match: (VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) + // result: (VPSHUFHWMasked512 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFHW512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFHWMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU16Masked512 (VPERMW512 x y) mask) // result: (VPERMWMasked512 x y mask) for { @@ -27311,6 +27403,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool { v.AddArg4(x, y, z, mask) return true } + // match: (VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) + // result: (VPSHUFDMasked512 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFD512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFDMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU32Masked512 (VPERMPS512 x y) mask) // result: (VPERMPSMasked512 x y mask) for { @@ -28610,6 +28716,19 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v *Value) bool { v.AddArg4(x, y, z, mask) return true } + // match: (VMOVDQU8Masked512 (VPSHUFB512 x y) mask) + // result: (VPSHUFBMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSHUFB512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFBMasked512) + v.AddArg3(x, 
y, mask) + return true + } // match: (VMOVDQU8Masked512 (VPERMB512 x y) mask) // result: (VPERMBMasked512 x y mask) for { diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 1c2b22a7fe..4ce329e1a4 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -794,6 +794,32 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float64x8.Permute2", opLen3_231(ssa.OpPermute2Float64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int64x8.Permute2", opLen3_231(ssa.OpPermute2Int64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint64x8.Permute2", opLen3_231(ssa.OpPermute2Uint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantInt32x4, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Uint32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantUint32x4, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Int32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x16, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Uint32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Int16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt16x8, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Int32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt32x4, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Uint16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint16x8, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, 
"Uint32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint32x4, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Int16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Uint16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Int16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt16x8, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Int32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt32x4, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Uint16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint16x8, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Uint32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint32x4, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Int16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Uint16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Int8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, 
"Uint8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.Reciprocal", opLen1(ssa.OpReciprocalFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Reciprocal", opLen1(ssa.OpReciprocalFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.Reciprocal", opLen1(ssa.OpReciprocalFloat32x16, types.TypeVec512), sys.AMD64) diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml index a576829e8f..556562b51a 100644 --- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml @@ -74,4 +74,32 @@ commutative: false documentation: !string |- // NAME copies element zero of its (128-bit) input to all elements of - // the 512-bit output vector. \ No newline at end of file + // the 512-bit output vector. +- go: PermuteGrouped + commutative: false + documentation: !string |- # Detailed documentation will rely on the specific ops. + // NAME performs a grouped permutation of vector x using indices: +- go: PermuteConstant + commutative: false + documentation: !string |- # Detailed documentation will rely on the specific ops. + // NAME performs a permutation of vector x using constant indices: +- go: PermuteConstantGrouped + commutative: false + documentation: !string |- # Detailed documentation will rely on the specific ops. + // NAME performs a grouped permutation of vector x using constant indices: +- go: PermuteConstantLo + commutative: false + documentation: !string |- # Detailed documentation will rely on the specific ops. + // NAME performs a permutation of vector x using constant indices: +- go: PermuteConstantLoGrouped + commutative: false + documentation: !string |- # Detailed documentation will rely on the specific ops. 
+ // NAME performs a grouped permutation of vector x using constant indices: +- go: PermuteConstantHi + commutative: false + documentation: !string |- # Detailed documentation will rely on the specific ops. + // NAME performs a permutation of vector x using constant indices: +- go: PermuteConstantHiGrouped + commutative: false + documentation: !string |- # Detailed documentation will rely on the specific ops. + // NAME performs a grouped permutation of vector x using constant indices: \ No newline at end of file diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml index 3cdb9efe27..3d471ec480 100644 --- a/src/simd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml @@ -432,4 +432,98 @@ go: $t name: indices out: - - *128any \ No newline at end of file + - *128any +- go: PermuteGrouped + asm: VPSHUFB + addDoc: !string |- + // result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} + // Only the needed bits to represent the index of a group of x are used in indices' elements. + // However when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed. + // Each group is of size 128-bit. + in: + - &256Or512any + bits: "256|512" + go: $t + - bits: "256|512" + go: $t + name: indices + out: + - *256Or512any + +- go: PermuteConstant + asm: VPSHUFD + addDoc: !string |- + // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} + // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. 
+ in: + - *128any + - class: immediate + immOffset: 0 + name: indices + out: + - *128any +- go: PermuteConstantGrouped + asm: VPSHUFD + addDoc: !string |- + // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} + // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // Each group is of size 128-bit. + in: + - *256Or512any + - class: immediate + immOffset: 0 + name: indices + out: + - *256Or512any + +- go: PermuteConstantLo + asm: VPSHUFHW + addDoc: !string |- + // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} + // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + in: + - *128any + - class: immediate + immOffset: 0 + name: indices + out: + - *128any +- go: PermuteConstantLoGrouped + asm: VPSHUFHW + addDoc: !string |- + // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} + // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // Each group is of size 128-bit. + in: + - *256Or512any + - class: immediate + immOffset: 0 + name: indices + out: + - *256Or512any + +- go: PermuteConstantHi + asm: VPSHUFHW + addDoc: !string |- + // result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} + // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + in: + - *128any + - class: immediate + immOffset: 0 + name: indices + out: + - *128any +- go: PermuteConstantHiGrouped + asm: VPSHUFHW + addDoc: !string |- + // result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} + // Here indices are word-size unsigned index value packed together, e.g. 
indices[0:2] is the first index. + // Each group is of size 128-bit. + in: + - *256Or512any + - class: immediate + immOffset: 0 + name: indices + out: + - *256Or512any \ No newline at end of file diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index e0e580bd27..e600f7c1a0 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -4564,6 +4564,266 @@ func (x Int64x8) Permute2(y Int64x8, indices Uint64x8) Int64x8 // Asm: VPERMI2Q, CPU Feature: AVX512 func (x Uint64x8) Permute2(y Uint64x8, indices Uint64x8) Uint64x8 +/* PermuteConstant */ + +// PermuteConstant performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX +func (x Int32x4) PermuteConstant(indices uint8) Int32x4 + +// PermuteConstant performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX +func (x Uint32x4) PermuteConstant(indices uint8) Uint32x4 + +/* PermuteConstantGrouped */ + +// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. 
+// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX2 +func (x Int32x8) PermuteConstantGrouped(indices uint8) Int32x8 + +// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX512 +func (x Int32x16) PermuteConstantGrouped(indices uint8) Int32x16 + +// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX2 +func (x Uint32x8) PermuteConstantGrouped(indices uint8) Uint32x8 + +// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPSHUFD, CPU Feature: AVX512 +func (x Uint32x16) PermuteConstantGrouped(indices uint8) Uint32x16 + +/* PermuteConstantHi */ + +// PermuteConstantHi performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Int16x8) PermuteConstantHi(indices uint8) Int16x8 + +// PermuteConstantHi performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX +func (x Int32x4) PermuteConstantHi(indices uint8) Int32x4 + +// PermuteConstantHi performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Uint16x8) PermuteConstantHi(indices uint8) Uint16x8 + +// PermuteConstantHi performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. 
+// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX +func (x Uint32x4) PermuteConstantHi(indices uint8) Uint32x4 + +/* PermuteConstantHiGrouped */ + +// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX2 +func (x Int16x16) PermuteConstantHiGrouped(indices uint8) Int16x16 + +// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Int16x32) PermuteConstantHiGrouped(indices uint8) Int16x32 + +// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. 
+// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX2 +func (x Uint16x16) PermuteConstantHiGrouped(indices uint8) Uint16x16 + +// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Uint16x32) PermuteConstantHiGrouped(indices uint8) Uint16x32 + +/* PermuteConstantLo */ + +// PermuteConstantLo performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Int16x8) PermuteConstantLo(indices uint8) Int16x8 + +// PermuteConstantLo performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPSHUFHW, CPU Feature: AVX +func (x Int32x4) PermuteConstantLo(indices uint8) Int32x4 + +// PermuteConstantLo performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Uint16x8) PermuteConstantLo(indices uint8) Uint16x8 + +// PermuteConstantLo performs a permutation of vector x using constant indices: +// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX +func (x Uint32x4) PermuteConstantLo(indices uint8) Uint32x4 + +/* PermuteConstantLoGrouped */ + +// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPSHUFHW, CPU Feature: AVX2 +func (x Int16x16) PermuteConstantLoGrouped(indices uint8) Int16x16 + +// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Int16x32) PermuteConstantLoGrouped(indices uint8) Int16x32 + +// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX2 +func (x Uint16x16) PermuteConstantLoGrouped(indices uint8) Uint16x16 + +// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices: +// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// + // Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Uint16x32) PermuteConstantLoGrouped(indices uint8) Uint16x32 + +/* PermuteGrouped */ + +// PermuteGrouped performs a grouped permutation of vector x using indices: +// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// Only the needed bits to represent the index of a group of x are used in indices' elements. +// However when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed. +// Each group is of size 128-bit. +// +// Asm: VPSHUFB, CPU Feature: AVX2 +func (x Int8x32) PermuteGrouped(indices Int8x32) Int8x32 + +// PermuteGrouped performs a grouped permutation of vector x using indices: +// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// Only the needed bits to represent the index of a group of x are used in indices' elements. +// However when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed. +// Each group is of size 128-bit. +// +// Asm: VPSHUFB, CPU Feature: AVX512 +func (x Int8x64) PermuteGrouped(indices Int8x64) Int8x64 + +// PermuteGrouped performs a grouped permutation of vector x using indices: +// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// Only the needed bits to represent the index of a group of x are used in indices' elements. +// However when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed. +// Each group is of size 128-bit. 
+// + // Asm: VPSHUFB, CPU Feature: AVX2 +func (x Uint8x32) PermuteGrouped(indices Uint8x32) Uint8x32 + +// PermuteGrouped performs a grouped permutation of vector x using indices: +// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// Only the needed bits to represent the index of a group of x are used in indices' elements. +// However when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed. +// Each group is of size 128-bit. +// +// Asm: VPSHUFB, CPU Feature: AVX512 +func (x Uint8x64) PermuteGrouped(indices Uint8x64) Uint8x64 + /* Reciprocal */ // Reciprocal computes an approximate reciprocal of each element. -- 2.52.0