From 42cda7c1dfcc1ab109766f2016efe2331b3d0aab Mon Sep 17 00:00:00 2001
From: Cherry Mui
Date: Fri, 19 Dec 2025 15:21:15 -0500
Subject: [PATCH] simd/archsimd: add Grouped for 256- and 512-bit SaturateTo(U)Int16Concat, and fix type

They operate on 128-bit groups, so name them Grouped to make this clear
and to stay consistent with other grouped operations. Reword the
documentation, mentioning the grouping only for the grouped versions.

Also, SaturateToUint16Concat(Grouped) is a signed int32 to unsigned
uint16 saturating conversion, so the receiver and the parameter should
be signed; the result remains unsigned.

Change-Id: I30e28bc05e07f5c28214c9c6d9d201cbbb183468
Reviewed-on: https://go-review.googlesource.com/c/go/+/731501
Reviewed-by: David Chase
LUCI-TryBot-Result: Go LUCI
---
 src/cmd/compile/internal/amd64/simdssa.go     | 20 ++++-----
 .../compile/internal/ssa/_gen/simdAMD64.rules | 18 ++++----
 .../internal/ssa/_gen/simdgenericOps.go       | 10 ++---
 src/cmd/compile/internal/ssa/opGen.go         | 22 +++++-----
 src/cmd/compile/internal/ssa/rewriteAMD64.go  | 18 ++++----
 .../compile/internal/ssagen/simdintrinsics.go | 10 ++---
 src/simd/archsimd/_gen/simdgen/godefs.go      |  2 +-
 .../_gen/simdgen/ops/Converts/categories.yaml |  4 +-
 .../_gen/simdgen/ops/Converts/go.yaml         | 32 ++++++++++++--
 src/simd/archsimd/ops_amd64.go                | 43 ++++++++++++-------
 10 files changed, 108 insertions(+), 71 deletions(-)

diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index 13353c75a9..454dbb3080 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -739,12 +739,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 	ssa.OpAMD64VPRORVQMasked128,
 	ssa.OpAMD64VPRORVQMasked256,
 	ssa.OpAMD64VPRORVQMasked512,
-	ssa.OpAMD64VPACKSSDWMasked128,
 	ssa.OpAMD64VPACKSSDWMasked256,
 	ssa.OpAMD64VPACKSSDWMasked512,
-	ssa.OpAMD64VPACKUSDWMasked128,
+	ssa.OpAMD64VPACKSSDWMasked128,
 	ssa.OpAMD64VPACKUSDWMasked256,
 	ssa.OpAMD64VPACKUSDWMasked512,
+	ssa.OpAMD64VPACKUSDWMasked128,
 	ssa.OpAMD64VSCALEFPSMasked128,
 	ssa.OpAMD64VSCALEFPSMasked256,
 	ssa.OpAMD64VSCALEFPSMasked512,
@@ -1575,12 +1575,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 	ssa.OpAMD64VPRORVQMasked128Merging,
 	ssa.OpAMD64VPRORVQMasked256Merging,
 	ssa.OpAMD64VPRORVQMasked512Merging,
-	ssa.OpAMD64VPACKSSDWMasked128Merging,
 	ssa.OpAMD64VPACKSSDWMasked256Merging,
 	ssa.OpAMD64VPACKSSDWMasked512Merging,
-	ssa.OpAMD64VPACKUSDWMasked128Merging,
+	ssa.OpAMD64VPACKSSDWMasked128Merging,
 	ssa.OpAMD64VPACKUSDWMasked256Merging,
 	ssa.OpAMD64VPACKUSDWMasked512Merging,
+	ssa.OpAMD64VPACKUSDWMasked128Merging,
 	ssa.OpAMD64VSCALEFPSMasked128Merging,
 	ssa.OpAMD64VSCALEFPSMasked256Merging,
 	ssa.OpAMD64VSCALEFPSMasked512Merging,
@@ -2162,12 +2162,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 	ssa.OpAMD64VPRORVQMasked128load,
 	ssa.OpAMD64VPRORVQMasked256load,
 	ssa.OpAMD64VPRORVQMasked512load,
-	ssa.OpAMD64VPACKSSDWMasked128load,
 	ssa.OpAMD64VPACKSSDWMasked256load,
 	ssa.OpAMD64VPACKSSDWMasked512load,
-	ssa.OpAMD64VPACKUSDWMasked128load,
+	ssa.OpAMD64VPACKSSDWMasked128load,
 	ssa.OpAMD64VPACKUSDWMasked256load,
 	ssa.OpAMD64VPACKUSDWMasked512load,
+	ssa.OpAMD64VPACKUSDWMasked128load,
 	ssa.OpAMD64VSCALEFPSMasked128load,
 	ssa.OpAMD64VSCALEFPSMasked256load,
 	ssa.OpAMD64VSCALEFPSMasked512load,
@@ -3439,12 +3439,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 	ssa.OpAMD64VPMOVSQBMasked128_128,
 	ssa.OpAMD64VPMOVSQBMasked128_256,
 	ssa.OpAMD64VPMOVSQBMasked128_512,
-	ssa.OpAMD64VPACKSSDWMasked128,
-
ssa.OpAMD64VPACKSSDWMasked128load, ssa.OpAMD64VPACKSSDWMasked256, ssa.OpAMD64VPACKSSDWMasked256load, ssa.OpAMD64VPACKSSDWMasked512, ssa.OpAMD64VPACKSSDWMasked512load, + ssa.OpAMD64VPACKSSDWMasked128, + ssa.OpAMD64VPACKSSDWMasked128load, ssa.OpAMD64VPMOVSDWMasked128_128, ssa.OpAMD64VPMOVSDWMasked128_256, ssa.OpAMD64VPMOVSDWMasked256, @@ -3463,12 +3463,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMOVUSQBMasked128_128, ssa.OpAMD64VPMOVUSQBMasked128_256, ssa.OpAMD64VPMOVUSQBMasked128_512, - ssa.OpAMD64VPACKUSDWMasked128, - ssa.OpAMD64VPACKUSDWMasked128load, ssa.OpAMD64VPACKUSDWMasked256, ssa.OpAMD64VPACKUSDWMasked256load, ssa.OpAMD64VPACKUSDWMasked512, ssa.OpAMD64VPACKUSDWMasked512load, + ssa.OpAMD64VPACKUSDWMasked128, + ssa.OpAMD64VPACKUSDWMasked128load, ssa.OpAMD64VPMOVUSDWMasked128_128, ssa.OpAMD64VPMOVUSDWMasked128_256, ssa.OpAMD64VPMOVUSDWMasked256, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 39d4f9b850..6b1cac322c 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -914,8 +914,8 @@ (SaturateToInt16Int64x4 ...) => (VPMOVSQW128_256 ...) (SaturateToInt16Int64x8 ...) => (VPMOVSQW128_512 ...) (SaturateToInt16ConcatInt32x4 ...) => (VPACKSSDW128 ...) -(SaturateToInt16ConcatInt32x8 ...) => (VPACKSSDW256 ...) -(SaturateToInt16ConcatInt32x16 ...) => (VPACKSSDW512 ...) +(SaturateToInt16ConcatGroupedInt32x8 ...) => (VPACKSSDW256 ...) +(SaturateToInt16ConcatGroupedInt32x16 ...) => (VPACKSSDW512 ...) (SaturateToInt32Int64x2 ...) => (VPMOVSQD128_128 ...) (SaturateToInt32Int64x4 ...) => (VPMOVSQD128_256 ...) (SaturateToInt32Int64x8 ...) => (VPMOVSQD256 ...) @@ -934,9 +934,9 @@ (SaturateToUint16Uint64x2 ...) => (VPMOVUSQW128_128 ...) (SaturateToUint16Uint64x4 ...) => (VPMOVUSQW128_256 ...) (SaturateToUint16Uint64x8 ...) => (VPMOVUSQW128_512 ...) -(SaturateToUint16ConcatUint32x4 ...) => (VPACKUSDW128 ...) -(SaturateToUint16ConcatUint32x8 ...) => (VPACKUSDW256 ...) -(SaturateToUint16ConcatUint32x16 ...) => (VPACKUSDW512 ...) +(SaturateToUint16ConcatInt32x4 ...) => (VPACKUSDW128 ...) +(SaturateToUint16ConcatGroupedInt32x8 ...) => (VPACKUSDW256 ...) +(SaturateToUint16ConcatGroupedInt32x16 ...) => (VPACKUSDW512 ...) (SaturateToUint32Uint64x2 ...) => (VPMOVUSQD128_128 ...) (SaturateToUint32Uint64x4 ...) => (VPMOVUSQD128_256 ...) (SaturateToUint32Uint64x8 ...) => (VPMOVUSQD256 ...) 
@@ -1775,9 +1775,9 @@ (VMOVDQU64Masked128 (VPMOVSQB128_128 x) mask) => (VPMOVSQBMasked128_128 x mask) (VMOVDQU64Masked256 (VPMOVSQB128_256 x) mask) => (VPMOVSQBMasked128_256 x mask) (VMOVDQU64Masked512 (VPMOVSQB128_512 x) mask) => (VPMOVSQBMasked128_512 x mask) -(VMOVDQU32Masked128 (VPACKSSDW128 x y) mask) => (VPACKSSDWMasked128 x y mask) (VMOVDQU32Masked256 (VPACKSSDW256 x y) mask) => (VPACKSSDWMasked256 x y mask) (VMOVDQU32Masked512 (VPACKSSDW512 x y) mask) => (VPACKSSDWMasked512 x y mask) +(VMOVDQU32Masked128 (VPACKSSDW128 x y) mask) => (VPACKSSDWMasked128 x y mask) (VMOVDQU32Masked128 (VPMOVSDW128_128 x) mask) => (VPMOVSDWMasked128_128 x mask) (VMOVDQU32Masked256 (VPMOVSDW128_256 x) mask) => (VPMOVSDWMasked128_256 x mask) (VMOVDQU32Masked256 (VPMOVSDW256 x) mask) => (VPMOVSDWMasked256 x mask) @@ -1796,9 +1796,9 @@ (VMOVDQU64Masked128 (VPMOVUSQB128_128 x) mask) => (VPMOVUSQBMasked128_128 x mask) (VMOVDQU64Masked256 (VPMOVUSQB128_256 x) mask) => (VPMOVUSQBMasked128_256 x mask) (VMOVDQU64Masked512 (VPMOVUSQB128_512 x) mask) => (VPMOVUSQBMasked128_512 x mask) -(VMOVDQU32Masked128 (VPACKUSDW128 x y) mask) => (VPACKUSDWMasked128 x y mask) (VMOVDQU32Masked256 (VPACKUSDW256 x y) mask) => (VPACKUSDWMasked256 x y mask) (VMOVDQU32Masked512 (VPACKUSDW512 x y) mask) => (VPACKUSDWMasked512 x y mask) +(VMOVDQU32Masked128 (VPACKUSDW128 x y) mask) => (VPACKUSDWMasked128 x y mask) (VMOVDQU32Masked128 (VPMOVUSDW128_128 x) mask) => (VPMOVUSDWMasked128_128 x mask) (VMOVDQU32Masked256 (VPMOVUSDW128_256 x) mask) => (VPMOVUSDWMasked128_256 x mask) (VMOVDQU32Masked256 (VPMOVUSDW256 x) mask) => (VPMOVUSDWMasked256 x mask) @@ -2948,13 +2948,13 @@ (VPRORVQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORVQMasked256load {sym} [off] x ptr mask mem) (VPRORVQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORVQMasked512load {sym} [off] x ptr mask mem) (VPACKSSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDW512load {sym} [off] x ptr mem) -(VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem) (VPACKSSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked256load {sym} [off] x ptr mask mem) (VPACKSSDWMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked512load {sym} [off] x ptr mask mem) +(VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem) (VPACKUSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDW512load {sym} [off] x ptr mem) -(VPACKUSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked128load {sym} [off] x ptr mask mem) (VPACKUSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked256load {sym} [off] x ptr mask mem) (VPACKUSDWMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked512load {sym} [off] x ptr mask mem) +(VPACKUSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked128load {sym} [off] x ptr mask mem) (VSCALEFPS128 x l:(VMOVDQUload128 {sym} [off] ptr 
mem)) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPS128load {sym} [off] x ptr mem) (VSCALEFPS256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPS256load {sym} [off] x ptr mem) (VSCALEFPS512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPS512load {sym} [off] x ptr mem) diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 36f3703bf1..07878e2c69 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -830,9 +830,9 @@ func simdGenericOps() []opData { {name: "SaturateToInt8Int64x2", argLength: 1, commutative: false}, {name: "SaturateToInt8Int64x4", argLength: 1, commutative: false}, {name: "SaturateToInt8Int64x8", argLength: 1, commutative: false}, + {name: "SaturateToInt16ConcatGroupedInt32x8", argLength: 2, commutative: false}, + {name: "SaturateToInt16ConcatGroupedInt32x16", argLength: 2, commutative: false}, {name: "SaturateToInt16ConcatInt32x4", argLength: 2, commutative: false}, - {name: "SaturateToInt16ConcatInt32x8", argLength: 2, commutative: false}, - {name: "SaturateToInt16ConcatInt32x16", argLength: 2, commutative: false}, {name: "SaturateToInt16Int32x4", argLength: 1, commutative: false}, {name: "SaturateToInt16Int32x8", argLength: 1, commutative: false}, {name: "SaturateToInt16Int32x16", argLength: 1, commutative: false}, @@ -851,9 +851,9 @@ func simdGenericOps() []opData { {name: "SaturateToUint8Uint64x2", argLength: 1, commutative: false}, {name: "SaturateToUint8Uint64x4", argLength: 1, commutative: false}, {name: "SaturateToUint8Uint64x8", argLength: 1, commutative: false}, - {name: "SaturateToUint16ConcatUint32x4", argLength: 2, commutative: false}, - {name: "SaturateToUint16ConcatUint32x8", argLength: 2, commutative: false}, - {name: "SaturateToUint16ConcatUint32x16", argLength: 2, commutative: false}, + {name: "SaturateToUint16ConcatGroupedInt32x8", argLength: 2, commutative: false}, + {name: "SaturateToUint16ConcatGroupedInt32x16", argLength: 2, commutative: false}, + {name: "SaturateToUint16ConcatInt32x4", argLength: 2, commutative: false}, {name: "SaturateToUint16Uint32x4", argLength: 1, commutative: false}, {name: "SaturateToUint16Uint32x8", argLength: 1, commutative: false}, {name: "SaturateToUint16Uint32x16", argLength: 1, commutative: false}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 71ad2c2a9a..ab7ca8de0d 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -7004,9 +7004,9 @@ const ( OpSaturateToInt8Int64x2 OpSaturateToInt8Int64x4 OpSaturateToInt8Int64x8 + OpSaturateToInt16ConcatGroupedInt32x8 + OpSaturateToInt16ConcatGroupedInt32x16 OpSaturateToInt16ConcatInt32x4 - OpSaturateToInt16ConcatInt32x8 - OpSaturateToInt16ConcatInt32x16 OpSaturateToInt16Int32x4 OpSaturateToInt16Int32x8 OpSaturateToInt16Int32x16 @@ -7025,9 +7025,9 @@ const ( OpSaturateToUint8Uint64x2 OpSaturateToUint8Uint64x4 OpSaturateToUint8Uint64x8 - OpSaturateToUint16ConcatUint32x4 - OpSaturateToUint16ConcatUint32x8 - OpSaturateToUint16ConcatUint32x16 + OpSaturateToUint16ConcatGroupedInt32x8 + OpSaturateToUint16ConcatGroupedInt32x16 + OpSaturateToUint16ConcatInt32x4 OpSaturateToUint16Uint32x4 OpSaturateToUint16Uint32x8 OpSaturateToUint16Uint32x16 @@ -93738,17 +93738,17 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "SaturateToInt16ConcatInt32x4", + name: 
"SaturateToInt16ConcatGroupedInt32x8", argLen: 2, generic: true, }, { - name: "SaturateToInt16ConcatInt32x8", + name: "SaturateToInt16ConcatGroupedInt32x16", argLen: 2, generic: true, }, { - name: "SaturateToInt16ConcatInt32x16", + name: "SaturateToInt16ConcatInt32x4", argLen: 2, generic: true, }, @@ -93843,17 +93843,17 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "SaturateToUint16ConcatUint32x4", + name: "SaturateToUint16ConcatGroupedInt32x8", argLen: 2, generic: true, }, { - name: "SaturateToUint16ConcatUint32x8", + name: "SaturateToUint16ConcatGroupedInt32x16", argLen: 2, generic: true, }, { - name: "SaturateToUint16ConcatUint32x16", + name: "SaturateToUint16ConcatInt32x4", argLen: 2, generic: true, }, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 9efc566c48..a0f4f6a704 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -5040,15 +5040,15 @@ func rewriteValueAMD64(v *Value) bool { case OpSHA256TwoRoundsUint32x4: v.Op = OpAMD64SHA256RNDS2128 return true - case OpSaturateToInt16ConcatInt32x16: + case OpSaturateToInt16ConcatGroupedInt32x16: v.Op = OpAMD64VPACKSSDW512 return true + case OpSaturateToInt16ConcatGroupedInt32x8: + v.Op = OpAMD64VPACKSSDW256 + return true case OpSaturateToInt16ConcatInt32x4: v.Op = OpAMD64VPACKSSDW128 return true - case OpSaturateToInt16ConcatInt32x8: - v.Op = OpAMD64VPACKSSDW256 - return true case OpSaturateToInt16Int32x16: v.Op = OpAMD64VPMOVSDW256 return true @@ -5103,15 +5103,15 @@ func rewriteValueAMD64(v *Value) bool { case OpSaturateToInt8Int64x8: v.Op = OpAMD64VPMOVSQB128_512 return true - case OpSaturateToUint16ConcatUint32x16: + case OpSaturateToUint16ConcatGroupedInt32x16: v.Op = OpAMD64VPACKUSDW512 return true - case OpSaturateToUint16ConcatUint32x4: - v.Op = OpAMD64VPACKUSDW128 - return true - case OpSaturateToUint16ConcatUint32x8: + case OpSaturateToUint16ConcatGroupedInt32x8: v.Op = OpAMD64VPACKUSDW256 return true + case OpSaturateToUint16ConcatInt32x4: + v.Op = OpAMD64VPACKUSDW128 + return true case OpSaturateToUint16Uint32x16: v.Op = OpAMD64VPMOVUSDW256 return true diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 22cf50d491..e1d7ac796d 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -926,8 +926,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Int64x4.SaturateToInt16", opLen1(ssa.OpSaturateToInt16Int64x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x8.SaturateToInt16", opLen1(ssa.OpSaturateToInt16Int64x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x4.SaturateToInt16Concat", opLen2(ssa.OpSaturateToInt16ConcatInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.SaturateToInt16Concat", opLen2(ssa.OpSaturateToInt16ConcatInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.SaturateToInt16Concat", opLen2(ssa.OpSaturateToInt16ConcatInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x8.SaturateToInt16ConcatGrouped", opLen2(ssa.OpSaturateToInt16ConcatGroupedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.SaturateToInt16ConcatGrouped", opLen2(ssa.OpSaturateToInt16ConcatGroupedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int64x2.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x4.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x8.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x8, types.TypeVec256), sys.AMD64) @@ -946,9 +946,9 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint64x2.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint64x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x8.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint64x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x4.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatUint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x8.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatUint32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint32x16.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.SaturateToUint16ConcatGrouped", opLen2(ssa.OpSaturateToUint16ConcatGroupedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.SaturateToUint16ConcatGrouped", opLen2(ssa.OpSaturateToUint16ConcatGroupedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint64x2.SaturateToUint32", opLen1(ssa.OpSaturateToUint32Uint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.SaturateToUint32", opLen1(ssa.OpSaturateToUint32Uint64x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x8.SaturateToUint32", opLen1(ssa.OpSaturateToUint32Uint64x8, types.TypeVec256), sys.AMD64) diff --git a/src/simd/archsimd/_gen/simdgen/godefs.go b/src/simd/archsimd/_gen/simdgen/godefs.go index 3ac74264e8..71cae158f7 100644 --- a/src/simd/archsimd/_gen/simdgen/godefs.go +++ b/src/simd/archsimd/_gen/simdgen/godefs.go @@ -142,7 +142,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error { outLanes := o.Out[0].Lanes if inLanes != nil && outLanes != nil && *inLanes < *outLanes { if (strings.Contains(o.Go, "Saturate") || strings.Contains(o.Go, "Truncate")) && - !strings.HasSuffix(o.Go, "Concat") { + !strings.Contains(o.Go, "Concat") { o.Documentation += "\n// Results are packed to low elements in the returned vector, its upper elements are zeroed." 
} } diff --git a/src/simd/archsimd/_gen/simdgen/ops/Converts/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/Converts/categories.yaml index 1e2a6a9b69..88e4840920 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/Converts/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/Converts/categories.yaml @@ -64,7 +64,7 @@ regexpTag: "convert" documentation: !string |- // NAME truncates element values to int16. -- go: "SaturateToInt16(Concat)?" +- go: "SaturateToInt16(Concat(Grouped)?)?" commutative: false regexpTag: "convert" documentation: !string |- @@ -109,7 +109,7 @@ regexpTag: "convert" documentation: !string |- // NAME truncates element values to uint16. -- go: "SaturateToUint16(Concat)?" +- go: "SaturateToUint16(Concat(Grouped)?)?" commutative: false regexpTag: "convert" documentation: !string |- diff --git a/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml index 1d688b434d..f436be23ef 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml @@ -445,23 +445,49 @@ regexpTag: "convert" asm: "VPACKSSDW" addDoc: &satDocConcat + !string |- + // The converted elements from x will be packed to the lower part of the result vector, + // the converted elements from y will be packed to the upper part of the result vector. + in: + - base: int + - base: int + out: + - base: int + bits: 128 +- go: SaturateToInt16ConcatGrouped + regexpTag: "convert" + asm: "VPACKSSDW" + addDoc: &satDocConcatGrouped !string |- // With each 128-bit as a group: - // The converted group from the first input vector will be packed to the lower part of the result vector, - // the converted group from the second input vector will be packed to the upper part of the result vector. + // The converted elements from x will be packed to the lower part of the group in the result vector, + // the converted elements from y will be packed to the upper part of the group in the result vector. in: - base: int - base: int out: - base: int + bits: 256|512 - go: SaturateToUint16Concat regexpTag: "convert" asm: "VPACKUSDW" + addDoc: *satDocConcat in: + - base: int + - base: int + out: - base: uint - - base: uint + bits: 128 +- go: SaturateToUint16ConcatGrouped + regexpTag: "convert" + asm: "VPACKUSDW" + addDoc: *satDocConcatGrouped + in: + - base: int + - base: int out: - base: uint + bits: 256|512 # low-part only conversions. # uint8->uint16 diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go index 304c0c0796..2a8a6bd4c6 100644 --- a/src/simd/archsimd/ops_amd64.go +++ b/src/simd/archsimd/ops_amd64.go @@ -5418,28 +5418,29 @@ func (x Int64x8) SaturateToInt16() Int16x8 /* SaturateToInt16Concat */ // SaturateToInt16Concat converts element values to int16 with signed saturation. -// With each 128-bit as a group: -// The converted group from the first input vector will be packed to the lower part of the result vector, -// the converted group from the second input vector will be packed to the upper part of the result vector. +// The converted elements from x will be packed to the lower part of the result vector, +// the converted elements from y will be packed to the upper part of the result vector. // // Asm: VPACKSSDW, CPU Feature: AVX func (x Int32x4) SaturateToInt16Concat(y Int32x4) Int16x8 -// SaturateToInt16Concat converts element values to int16 with signed saturation. 
+/* SaturateToInt16ConcatGrouped */ + +// SaturateToInt16ConcatGrouped converts element values to int16 with signed saturation. // With each 128-bit as a group: -// The converted group from the first input vector will be packed to the lower part of the result vector, -// the converted group from the second input vector will be packed to the upper part of the result vector. +// The converted elements from x will be packed to the lower part of the group in the result vector, +// the converted elements from y will be packed to the upper part of the group in the result vector. // // Asm: VPACKSSDW, CPU Feature: AVX2 -func (x Int32x8) SaturateToInt16Concat(y Int32x8) Int16x16 +func (x Int32x8) SaturateToInt16ConcatGrouped(y Int32x8) Int16x16 -// SaturateToInt16Concat converts element values to int16 with signed saturation. +// SaturateToInt16ConcatGrouped converts element values to int16 with signed saturation. // With each 128-bit as a group: -// The converted group from the first input vector will be packed to the lower part of the result vector, -// the converted group from the second input vector will be packed to the upper part of the result vector. +// The converted elements from x will be packed to the lower part of the group in the result vector, +// the converted elements from y will be packed to the upper part of the group in the result vector. // // Asm: VPACKSSDW, CPU Feature: AVX512 -func (x Int32x16) SaturateToInt16Concat(y Int32x16) Int16x32 +func (x Int32x16) SaturateToInt16ConcatGrouped(y Int32x16) Int16x32 /* SaturateToInt32 */ @@ -5550,19 +5551,29 @@ func (x Uint64x8) SaturateToUint16() Uint16x8 /* SaturateToUint16Concat */ // SaturateToUint16Concat converts element values to uint16 with unsigned saturation. +// The converted elements from x will be packed to the lower part of the result vector, +// the converted elements from y will be packed to the upper part of the result vector. // // Asm: VPACKUSDW, CPU Feature: AVX -func (x Uint32x4) SaturateToUint16Concat(y Uint32x4) Uint16x8 +func (x Int32x4) SaturateToUint16Concat(y Int32x4) Uint16x8 -// SaturateToUint16Concat converts element values to uint16 with unsigned saturation. +/* SaturateToUint16ConcatGrouped */ + +// SaturateToUint16ConcatGrouped converts element values to uint16 with unsigned saturation. +// With each 128-bit as a group: +// The converted elements from x will be packed to the lower part of the group in the result vector, +// the converted elements from y will be packed to the upper part of the group in the result vector. // // Asm: VPACKUSDW, CPU Feature: AVX2 -func (x Uint32x8) SaturateToUint16Concat(y Uint32x8) Uint16x16 +func (x Int32x8) SaturateToUint16ConcatGrouped(y Int32x8) Uint16x16 -// SaturateToUint16Concat converts element values to uint16 with unsigned saturation. +// SaturateToUint16ConcatGrouped converts element values to uint16 with unsigned saturation. +// With each 128-bit as a group: +// The converted elements from x will be packed to the lower part of the group in the result vector, +// the converted elements from y will be packed to the upper part of the group in the result vector. // // Asm: VPACKUSDW, CPU Feature: AVX512 -func (x Uint32x16) SaturateToUint16Concat(y Uint32x16) Uint16x32 +func (x Int32x16) SaturateToUint16ConcatGrouped(y Int32x16) Uint16x32 /* SaturateToUint32 */ -- 2.52.0
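Below is a plain-Go reference sketch, not part of this change and not how archsimd
implements it, of the packing order the renamed methods document: with each 128-bit
slice as a group, the lower half of each output group comes from the receiver x and
the upper half from the argument y, and the SaturateToUint16Concat variants saturate
signed int32 inputs into unsigned uint16 outputs (which is why their receiver and
parameter are now Int32 while the result stays unsigned).

package main

import "fmt"

// satInt16 clamps a signed 32-bit value to the int16 range, mirroring VPACKSSDW's
// per-element saturation.
func satInt16(v int32) int16 {
	if v > 32767 {
		return 32767
	}
	if v < -32768 {
		return -32768
	}
	return int16(v)
}

// satUint16 clamps a signed 32-bit value to the uint16 range, mirroring VPACKUSDW's
// per-element saturation: negative inputs become 0, so the inputs are signed even
// though the result is unsigned.
func satUint16(v int32) uint16 {
	if v < 0 {
		return 0
	}
	if v > 65535 {
		return 65535
	}
	return uint16(v)
}

// packInt16Grouped models Int32x8.SaturateToInt16ConcatGrouped(y): for each 128-bit
// group (4 int32 in, 8 int16 out), the lower half of the output group is the
// saturated x group and the upper half is the saturated y group.
func packInt16Grouped(x, y [8]int32) [16]int16 {
	var out [16]int16
	for g := 0; g < 2; g++ { // two 128-bit groups in a 256-bit vector
		for i := 0; i < 4; i++ {
			out[g*8+i] = satInt16(x[g*4+i])
			out[g*8+4+i] = satInt16(y[g*4+i])
		}
	}
	return out
}

func main() {
	x := [8]int32{0, 1, 70000, -70000, 4, 5, 6, 7}
	y := [8]int32{10, 11, 12, 13, 14, 15, 16, 17}
	// Group 0 holds x[0:4] then y[0:4]; group 1 holds x[4:8] then y[4:8].
	fmt.Println(packInt16Grouped(x, y))
	// Signed inputs, unsigned saturated outputs.
	fmt.Println(satUint16(-5), satUint16(70000)) // 0 65535
}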