simd/archsimd: add Grouped for 256- and 512-bit SaturateTo(U)Int16Concat, and fix the input type of SaturateToUint16Concat (Int32, not Uint32)

author Cherry Mui <cherryyz@google.com>

Fri, 19 Dec 2025 20:21:15 +0000 (15:21 -0500)

committer Cherry Mui <cherryyz@google.com>

Fri, 19 Dec 2025 22:39:26 +0000 (14:39 -0800)
author Cherry Mui <cherryyz@google.com>
Fri, 19 Dec 2025 20:21:15 +0000 (15:21 -0500)
committer Cherry Mui <cherryyz@google.com>
Fri, 19 Dec 2025 22:39:26 +0000 (14:39 -0800)
diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go

index 13353c75a9855c8d866e6b702bf04841ebad85ec..454dbb308090a8fbef637689a1d62f1789173fbd 100644 (file)
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -739,12 +739,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPRORVQMasked128,
                 ssa.OpAMD64VPRORVQMasked256,
                 ssa.OpAMD64VPRORVQMasked512,
-               ssa.OpAMD64VPACKSSDWMasked128,
                 ssa.OpAMD64VPACKSSDWMasked256,
                 ssa.OpAMD64VPACKSSDWMasked512,
-               ssa.OpAMD64VPACKUSDWMasked128,
+               ssa.OpAMD64VPACKSSDWMasked128,
                 ssa.OpAMD64VPACKUSDWMasked256,
                 ssa.OpAMD64VPACKUSDWMasked512,
+               ssa.OpAMD64VPACKUSDWMasked128,
                 ssa.OpAMD64VSCALEFPSMasked128,
                 ssa.OpAMD64VSCALEFPSMasked256,
                 ssa.OpAMD64VSCALEFPSMasked512,
@@ -1575,12 +1575,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPRORVQMasked128Merging,
                 ssa.OpAMD64VPRORVQMasked256Merging,
                 ssa.OpAMD64VPRORVQMasked512Merging,
-               ssa.OpAMD64VPACKSSDWMasked128Merging,
                 ssa.OpAMD64VPACKSSDWMasked256Merging,
                 ssa.OpAMD64VPACKSSDWMasked512Merging,
-               ssa.OpAMD64VPACKUSDWMasked128Merging,
+               ssa.OpAMD64VPACKSSDWMasked128Merging,
                 ssa.OpAMD64VPACKUSDWMasked256Merging,
                 ssa.OpAMD64VPACKUSDWMasked512Merging,
+               ssa.OpAMD64VPACKUSDWMasked128Merging,
                 ssa.OpAMD64VSCALEFPSMasked128Merging,
                 ssa.OpAMD64VSCALEFPSMasked256Merging,
                 ssa.OpAMD64VSCALEFPSMasked512Merging,
@@ -2162,12 +2162,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPRORVQMasked128load,
                 ssa.OpAMD64VPRORVQMasked256load,
                 ssa.OpAMD64VPRORVQMasked512load,
-               ssa.OpAMD64VPACKSSDWMasked128load,
                 ssa.OpAMD64VPACKSSDWMasked256load,
                 ssa.OpAMD64VPACKSSDWMasked512load,
-               ssa.OpAMD64VPACKUSDWMasked128load,
+               ssa.OpAMD64VPACKSSDWMasked128load,
                 ssa.OpAMD64VPACKUSDWMasked256load,
                 ssa.OpAMD64VPACKUSDWMasked512load,
+               ssa.OpAMD64VPACKUSDWMasked128load,
                 ssa.OpAMD64VSCALEFPSMasked128load,
                 ssa.OpAMD64VSCALEFPSMasked256load,
                 ssa.OpAMD64VSCALEFPSMasked512load,
@@ -3439,12 +3439,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPMOVSQBMasked128_128,
                 ssa.OpAMD64VPMOVSQBMasked128_256,
                 ssa.OpAMD64VPMOVSQBMasked128_512,
-               ssa.OpAMD64VPACKSSDWMasked128,
-               ssa.OpAMD64VPACKSSDWMasked128load,
                 ssa.OpAMD64VPACKSSDWMasked256,
                 ssa.OpAMD64VPACKSSDWMasked256load,
                 ssa.OpAMD64VPACKSSDWMasked512,
                 ssa.OpAMD64VPACKSSDWMasked512load,
+               ssa.OpAMD64VPACKSSDWMasked128,
+               ssa.OpAMD64VPACKSSDWMasked128load,
                 ssa.OpAMD64VPMOVSDWMasked128_128,
                 ssa.OpAMD64VPMOVSDWMasked128_256,
                 ssa.OpAMD64VPMOVSDWMasked256,
@@ -3463,12 +3463,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                 ssa.OpAMD64VPMOVUSQBMasked128_128,
                 ssa.OpAMD64VPMOVUSQBMasked128_256,
                 ssa.OpAMD64VPMOVUSQBMasked128_512,
-               ssa.OpAMD64VPACKUSDWMasked128,
-               ssa.OpAMD64VPACKUSDWMasked128load,
                 ssa.OpAMD64VPACKUSDWMasked256,
                 ssa.OpAMD64VPACKUSDWMasked256load,
                 ssa.OpAMD64VPACKUSDWMasked512,
                 ssa.OpAMD64VPACKUSDWMasked512load,
+               ssa.OpAMD64VPACKUSDWMasked128,
+               ssa.OpAMD64VPACKUSDWMasked128load,
                 ssa.OpAMD64VPMOVUSDWMasked128_128,
                 ssa.OpAMD64VPMOVUSDWMasked128_256,
                 ssa.OpAMD64VPMOVUSDWMasked256,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules

index 39d4f9b8509cdf96ef8a54c4e75c77d4575ac645..6b1cac322cdaedf118f325a3e03618928006d0df 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -914,8 +914,8 @@
  (SaturateToInt16Int64x4 ...) => (VPMOVSQW128_256 ...)
  (SaturateToInt16Int64x8 ...) => (VPMOVSQW128_512 ...)
  (SaturateToInt16ConcatInt32x4 ...) => (VPACKSSDW128 ...)
-(SaturateToInt16ConcatInt32x8 ...) => (VPACKSSDW256 ...)
-(SaturateToInt16ConcatInt32x16 ...) => (VPACKSSDW512 ...)
+(SaturateToInt16ConcatGroupedInt32x8 ...) => (VPACKSSDW256 ...)
+(SaturateToInt16ConcatGroupedInt32x16 ...) => (VPACKSSDW512 ...)
  (SaturateToInt32Int64x2 ...) => (VPMOVSQD128_128 ...)
  (SaturateToInt32Int64x4 ...) => (VPMOVSQD128_256 ...)
  (SaturateToInt32Int64x8 ...) => (VPMOVSQD256 ...)
@@ -934,9 +934,9 @@
  (SaturateToUint16Uint64x2 ...) => (VPMOVUSQW128_128 ...)
  (SaturateToUint16Uint64x4 ...) => (VPMOVUSQW128_256 ...)
  (SaturateToUint16Uint64x8 ...) => (VPMOVUSQW128_512 ...)
-(SaturateToUint16ConcatUint32x4 ...) => (VPACKUSDW128 ...)
-(SaturateToUint16ConcatUint32x8 ...) => (VPACKUSDW256 ...)
-(SaturateToUint16ConcatUint32x16 ...) => (VPACKUSDW512 ...)
+(SaturateToUint16ConcatInt32x4 ...) => (VPACKUSDW128 ...)
+(SaturateToUint16ConcatGroupedInt32x8 ...) => (VPACKUSDW256 ...)
+(SaturateToUint16ConcatGroupedInt32x16 ...) => (VPACKUSDW512 ...)
  (SaturateToUint32Uint64x2 ...) => (VPMOVUSQD128_128 ...)
  (SaturateToUint32Uint64x4 ...) => (VPMOVUSQD128_256 ...)
  (SaturateToUint32Uint64x8 ...) => (VPMOVUSQD256 ...)
@@ -1775,9 +1775,9 @@
  (VMOVDQU64Masked128 (VPMOVSQB128_128 x) mask) => (VPMOVSQBMasked128_128 x mask)
  (VMOVDQU64Masked256 (VPMOVSQB128_256 x) mask) => (VPMOVSQBMasked128_256 x mask)
  (VMOVDQU64Masked512 (VPMOVSQB128_512 x) mask) => (VPMOVSQBMasked128_512 x mask)
-(VMOVDQU32Masked128 (VPACKSSDW128 x y) mask) => (VPACKSSDWMasked128 x y mask)
  (VMOVDQU32Masked256 (VPACKSSDW256 x y) mask) => (VPACKSSDWMasked256 x y mask)
  (VMOVDQU32Masked512 (VPACKSSDW512 x y) mask) => (VPACKSSDWMasked512 x y mask)
+(VMOVDQU32Masked128 (VPACKSSDW128 x y) mask) => (VPACKSSDWMasked128 x y mask)
  (VMOVDQU32Masked128 (VPMOVSDW128_128 x) mask) => (VPMOVSDWMasked128_128 x mask)
  (VMOVDQU32Masked256 (VPMOVSDW128_256 x) mask) => (VPMOVSDWMasked128_256 x mask)
  (VMOVDQU32Masked256 (VPMOVSDW256 x) mask) => (VPMOVSDWMasked256 x mask)
@@ -1796,9 +1796,9 @@
  (VMOVDQU64Masked128 (VPMOVUSQB128_128 x) mask) => (VPMOVUSQBMasked128_128 x mask)
  (VMOVDQU64Masked256 (VPMOVUSQB128_256 x) mask) => (VPMOVUSQBMasked128_256 x mask)
  (VMOVDQU64Masked512 (VPMOVUSQB128_512 x) mask) => (VPMOVUSQBMasked128_512 x mask)
-(VMOVDQU32Masked128 (VPACKUSDW128 x y) mask) => (VPACKUSDWMasked128 x y mask)
  (VMOVDQU32Masked256 (VPACKUSDW256 x y) mask) => (VPACKUSDWMasked256 x y mask)
  (VMOVDQU32Masked512 (VPACKUSDW512 x y) mask) => (VPACKUSDWMasked512 x y mask)
+(VMOVDQU32Masked128 (VPACKUSDW128 x y) mask) => (VPACKUSDWMasked128 x y mask)
  (VMOVDQU32Masked128 (VPMOVUSDW128_128 x) mask) => (VPMOVUSDWMasked128_128 x mask)
  (VMOVDQU32Masked256 (VPMOVUSDW128_256 x) mask) => (VPMOVUSDWMasked128_256 x mask)
  (VMOVDQU32Masked256 (VPMOVUSDW256 x) mask) => (VPMOVUSDWMasked256 x mask)
@@ -2948,13 +2948,13 @@
  (VPRORVQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORVQMasked256load {sym} [off] x ptr mask mem)
  (VPRORVQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORVQMasked512load {sym} [off] x ptr mask mem)
  (VPACKSSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDW512load {sym} [off] x ptr mem)
-(VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem)
  (VPACKSSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked256load {sym} [off] x ptr mask mem)
  (VPACKSSDWMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked512load {sym} [off] x ptr mask mem)
+(VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem)
  (VPACKUSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDW512load {sym} [off] x ptr mem)
-(VPACKUSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked128load {sym} [off] x ptr mask mem)
  (VPACKUSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked256load {sym} [off] x ptr mask mem)
  (VPACKUSDWMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked512load {sym} [off] x ptr mask mem)
+(VPACKUSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked128load {sym} [off] x ptr mask mem)
  (VSCALEFPS128 x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPS128load {sym} [off] x ptr mem)
  (VSCALEFPS256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPS256load {sym} [off] x ptr mem)
  (VSCALEFPS512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPS512load {sym} [off] x ptr mem)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go

index 36f3703bf159cce527619f6122a36c39fa2f221f..07878e2c69356988f9a0f7c0b71d470708c6fd6f 100644 (file)
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -830,9 +830,9 @@ func simdGenericOps() []opData {
                 {name: "SaturateToInt8Int64x2", argLength: 1, commutative: false},
                 {name: "SaturateToInt8Int64x4", argLength: 1, commutative: false},
                 {name: "SaturateToInt8Int64x8", argLength: 1, commutative: false},
+               {name: "SaturateToInt16ConcatGroupedInt32x8", argLength: 2, commutative: false},
+               {name: "SaturateToInt16ConcatGroupedInt32x16", argLength: 2, commutative: false},
                 {name: "SaturateToInt16ConcatInt32x4", argLength: 2, commutative: false},
-               {name: "SaturateToInt16ConcatInt32x8", argLength: 2, commutative: false},
-               {name: "SaturateToInt16ConcatInt32x16", argLength: 2, commutative: false},
                 {name: "SaturateToInt16Int32x4", argLength: 1, commutative: false},
                 {name: "SaturateToInt16Int32x8", argLength: 1, commutative: false},
                 {name: "SaturateToInt16Int32x16", argLength: 1, commutative: false},
@@ -851,9 +851,9 @@ func simdGenericOps() []opData {
                 {name: "SaturateToUint8Uint64x2", argLength: 1, commutative: false},
                 {name: "SaturateToUint8Uint64x4", argLength: 1, commutative: false},
                 {name: "SaturateToUint8Uint64x8", argLength: 1, commutative: false},
-               {name: "SaturateToUint16ConcatUint32x4", argLength: 2, commutative: false},
-               {name: "SaturateToUint16ConcatUint32x8", argLength: 2, commutative: false},
-               {name: "SaturateToUint16ConcatUint32x16", argLength: 2, commutative: false},
+               {name: "SaturateToUint16ConcatGroupedInt32x8", argLength: 2, commutative: false},
+               {name: "SaturateToUint16ConcatGroupedInt32x16", argLength: 2, commutative: false},
+               {name: "SaturateToUint16ConcatInt32x4", argLength: 2, commutative: false},
                 {name: "SaturateToUint16Uint32x4", argLength: 1, commutative: false},
                 {name: "SaturateToUint16Uint32x8", argLength: 1, commutative: false},
                 {name: "SaturateToUint16Uint32x16", argLength: 1, commutative: false},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go

index 71ad2c2a9a2c7a863586fa2f63dbbf4bcb8f7f18..ab7ca8de0dea89c7f5f4881f61da0bc83943beb6 100644 (file)
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -7004,9 +7004,9 @@ const (
         OpSaturateToInt8Int64x2
         OpSaturateToInt8Int64x4
         OpSaturateToInt8Int64x8
+       OpSaturateToInt16ConcatGroupedInt32x8
+       OpSaturateToInt16ConcatGroupedInt32x16
         OpSaturateToInt16ConcatInt32x4
-       OpSaturateToInt16ConcatInt32x8
-       OpSaturateToInt16ConcatInt32x16
         OpSaturateToInt16Int32x4
         OpSaturateToInt16Int32x8
         OpSaturateToInt16Int32x16
@@ -7025,9 +7025,9 @@ const (
         OpSaturateToUint8Uint64x2
         OpSaturateToUint8Uint64x4
         OpSaturateToUint8Uint64x8
-       OpSaturateToUint16ConcatUint32x4
-       OpSaturateToUint16ConcatUint32x8
-       OpSaturateToUint16ConcatUint32x16
+       OpSaturateToUint16ConcatGroupedInt32x8
+       OpSaturateToUint16ConcatGroupedInt32x16
+       OpSaturateToUint16ConcatInt32x4
         OpSaturateToUint16Uint32x4
         OpSaturateToUint16Uint32x8
         OpSaturateToUint16Uint32x16
@@ -93738,17 +93738,17 @@ var opcodeTable = [...]opInfo{
                 generic: true,
         },
         {
-               name:    "SaturateToInt16ConcatInt32x4",
+               name:    "SaturateToInt16ConcatGroupedInt32x8",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "SaturateToInt16ConcatInt32x8",
+               name:    "SaturateToInt16ConcatGroupedInt32x16",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "SaturateToInt16ConcatInt32x16",
+               name:    "SaturateToInt16ConcatInt32x4",
                 argLen:  2,
                 generic: true,
         },
@@ -93843,17 +93843,17 @@ var opcodeTable = [...]opInfo{
                 generic: true,
         },
         {
-               name:    "SaturateToUint16ConcatUint32x4",
+               name:    "SaturateToUint16ConcatGroupedInt32x8",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "SaturateToUint16ConcatUint32x8",
+               name:    "SaturateToUint16ConcatGroupedInt32x16",
                 argLen:  2,
                 generic: true,
         },
         {
-               name:    "SaturateToUint16ConcatUint32x16",
+               name:    "SaturateToUint16ConcatInt32x4",
                 argLen:  2,
                 generic: true,
         },
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go

index 9efc566c4821370bb8a71adc9192cfb601709b11..a0f4f6a704dc9e1216d5d5f12f2afc928715d72d 100644 (file)
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -5040,15 +5040,15 @@ func rewriteValueAMD64(v *Value) bool {
         case OpSHA256TwoRoundsUint32x4:
                 v.Op = OpAMD64SHA256RNDS2128
                 return true
-       case OpSaturateToInt16ConcatInt32x16:
+       case OpSaturateToInt16ConcatGroupedInt32x16:
                 v.Op = OpAMD64VPACKSSDW512
                 return true
+       case OpSaturateToInt16ConcatGroupedInt32x8:
+               v.Op = OpAMD64VPACKSSDW256
+               return true
         case OpSaturateToInt16ConcatInt32x4:
                 v.Op = OpAMD64VPACKSSDW128
                 return true
-       case OpSaturateToInt16ConcatInt32x8:
-               v.Op = OpAMD64VPACKSSDW256
-               return true
         case OpSaturateToInt16Int32x16:
                 v.Op = OpAMD64VPMOVSDW256
                 return true
@@ -5103,15 +5103,15 @@ func rewriteValueAMD64(v *Value) bool {
         case OpSaturateToInt8Int64x8:
                 v.Op = OpAMD64VPMOVSQB128_512
                 return true
-       case OpSaturateToUint16ConcatUint32x16:
+       case OpSaturateToUint16ConcatGroupedInt32x16:
                 v.Op = OpAMD64VPACKUSDW512
                 return true
-       case OpSaturateToUint16ConcatUint32x4:
-               v.Op = OpAMD64VPACKUSDW128
-               return true
-       case OpSaturateToUint16ConcatUint32x8:
+       case OpSaturateToUint16ConcatGroupedInt32x8:
                 v.Op = OpAMD64VPACKUSDW256
                 return true
+       case OpSaturateToUint16ConcatInt32x4:
+               v.Op = OpAMD64VPACKUSDW128
+               return true
         case OpSaturateToUint16Uint32x16:
                 v.Op = OpAMD64VPMOVUSDW256
                 return true
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go

index 22cf50d491c9759c3b0f5a7269341160861ce548..e1d7ac796dea058885ee07c6c5caa77b78c45ddd 100644 (file)
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -926,8 +926,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
         addF(simdPackage, "Int64x4.SaturateToInt16", opLen1(ssa.OpSaturateToInt16Int64x4, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Int64x8.SaturateToInt16", opLen1(ssa.OpSaturateToInt16Int64x8, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Int32x4.SaturateToInt16Concat", opLen2(ssa.OpSaturateToInt16ConcatInt32x4, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Int32x8.SaturateToInt16Concat", opLen2(ssa.OpSaturateToInt16ConcatInt32x8, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Int32x16.SaturateToInt16Concat", opLen2(ssa.OpSaturateToInt16ConcatInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x8.SaturateToInt16ConcatGrouped", opLen2(ssa.OpSaturateToInt16ConcatGroupedInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x16.SaturateToInt16ConcatGrouped", opLen2(ssa.OpSaturateToInt16ConcatGroupedInt32x16, types.TypeVec512), sys.AMD64)
         addF(simdPackage, "Int64x2.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x2, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Int64x4.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x4, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Int64x8.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x8, types.TypeVec256), sys.AMD64)
@@ -946,9 +946,9 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
         addF(simdPackage, "Uint64x2.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint64x2, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Uint64x4.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint64x4, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Uint64x8.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint64x8, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint32x4.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatUint32x4, types.TypeVec128), sys.AMD64)
-       addF(simdPackage, "Uint32x8.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatUint32x8, types.TypeVec256), sys.AMD64)
-       addF(simdPackage, "Uint32x16.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatUint32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x4.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x8.SaturateToUint16ConcatGrouped", opLen2(ssa.OpSaturateToUint16ConcatGroupedInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x16.SaturateToUint16ConcatGrouped", opLen2(ssa.OpSaturateToUint16ConcatGroupedInt32x16, types.TypeVec512), sys.AMD64)
         addF(simdPackage, "Uint64x2.SaturateToUint32", opLen1(ssa.OpSaturateToUint32Uint64x2, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Uint64x4.SaturateToUint32", opLen1(ssa.OpSaturateToUint32Uint64x4, types.TypeVec128), sys.AMD64)
         addF(simdPackage, "Uint64x8.SaturateToUint32", opLen1(ssa.OpSaturateToUint32Uint64x8, types.TypeVec256), sys.AMD64)
diff --git a/src/simd/archsimd/_gen/simdgen/godefs.go b/src/simd/archsimd/_gen/simdgen/godefs.go

index 3ac74264e8333518f6aac6c50889e4ef859738be..71cae158f770fc40bf8b8ceecfbe297ef03f19e4 100644 (file)
--- a/src/simd/archsimd/_gen/simdgen/godefs.go
+++ b/src/simd/archsimd/_gen/simdgen/godefs.go
@@ -142,7 +142,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error {
                 outLanes := o.Out[0].Lanes
                 if inLanes != nil && outLanes != nil && *inLanes < *outLanes {
                         if (strings.Contains(o.Go, "Saturate") || strings.Contains(o.Go, "Truncate")) &&
-                               !strings.HasSuffix(o.Go, "Concat") {
+                               !strings.Contains(o.Go, "Concat") {
                                 o.Documentation += "\n// Results are packed to low elements in the returned vector, its upper elements are zeroed."
                         }
                 }
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Converts/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/Converts/categories.yaml

index 1e2a6a9b694de9330f69b03dc8b31f5ca5e1ba3d..88e4840920c0deea31a331b4fc24cee26b9bde33 100644 (file)
--- a/src/simd/archsimd/_gen/simdgen/ops/Converts/categories.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Converts/categories.yaml
@@ -64,7 +64,7 @@
    regexpTag: "convert"
    documentation: !string |-
      // NAME truncates element values to int16.
-- go: "SaturateToInt16(Concat)?"
+- go: "SaturateToInt16(Concat(Grouped)?)?"
    commutative: false
    regexpTag: "convert"
    documentation: !string |-
@@ -109,7 +109,7 @@
    regexpTag: "convert"
    documentation: !string |-
      // NAME truncates element values to uint16.
-- go: "SaturateToUint16(Concat)?"
+- go: "SaturateToUint16(Concat(Grouped)?)?"
    commutative: false
    regexpTag: "convert"
    documentation: !string |-
diff --git a/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml

index 1d688b434d1e6d04a6943a05b705aad198e74617..f436be23efabb21ac42747754f56c6eb970dbc32 100644 (file)
--- a/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml
@@ -445,23 +445,49 @@
    regexpTag: "convert"
    asm: "VPACKSSDW"
    addDoc: &satDocConcat
+    !string |-
+    // The converted elements from x will be packed to the lower part of the result vector,
+    // the converted elements from y will be packed to the upper part of the result vector.
+  in:
+    - base: int
+    - base: int
+  out:
+    - base: int
+      bits: 128
+- go: SaturateToInt16ConcatGrouped
+  regexpTag: "convert"
+  asm: "VPACKSSDW"
+  addDoc: &satDocConcatGrouped
      !string |-
      // With each 128-bit as a group:
-    // The converted group from the first input vector will be packed to the lower part of the result vector,
-    // the converted group from the second input vector will be packed to the upper part of the result vector.
+    // The converted elements from x will be packed to the lower part of the group in the result vector,
+    // the converted elements from y will be packed to the upper part of the group in the result vector.
    in:
      - base: int
      - base: int
    out:
      - base: int
+      bits: 256|512
  - go: SaturateToUint16Concat
    regexpTag: "convert"
    asm: "VPACKUSDW"
+  addDoc: *satDocConcat
    in:
+    - base: int
+    - base: int
+  out:
      - base: uint
-    - base: uint
+      bits: 128
+- go: SaturateToUint16ConcatGrouped
+  regexpTag: "convert"
+  asm: "VPACKUSDW"
+  addDoc: *satDocConcatGrouped
+  in:
+    - base: int
+    - base: int
    out:
      - base: uint
+      bits: 256|512
  
  # low-part only conversions.
  # uint8->uint16
diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go

index 304c0c07967fc7eaa6ad7817952f015d409f7c4a..2a8a6bd4c616c0a74db94c701dce0a80431c378c 100644 (file)
--- a/src/simd/archsimd/ops_amd64.go
+++ b/src/simd/archsimd/ops_amd64.go
@@ -5418,28 +5418,29 @@ func (x Int64x8) SaturateToInt16() Int16x8
  /* SaturateToInt16Concat */
  
  // SaturateToInt16Concat converts element values to int16 with signed saturation.
-// With each 128-bit as a group:
-// The converted group from the first input vector will be packed to the lower part of the result vector,
-// the converted group from the second input vector will be packed to the upper part of the result vector.
+// The converted elements from x will be packed to the lower part of the result vector,
+// the converted elements from y will be packed to the upper part of the result vector.
  //
  // Asm: VPACKSSDW, CPU Feature: AVX
  func (x Int32x4) SaturateToInt16Concat(y Int32x4) Int16x8
  
-// SaturateToInt16Concat converts element values to int16 with signed saturation.
+/* SaturateToInt16ConcatGrouped */
+
+// SaturateToInt16ConcatGrouped converts element values to int16 with signed saturation.
  // With each 128-bit as a group:
-// The converted group from the first input vector will be packed to the lower part of the result vector,
-// the converted group from the second input vector will be packed to the upper part of the result vector.
+// The converted elements from x will be packed to the lower part of the group in the result vector,
+// the converted elements from y will be packed to the upper part of the group in the result vector.
  //
  // Asm: VPACKSSDW, CPU Feature: AVX2
-func (x Int32x8) SaturateToInt16Concat(y Int32x8) Int16x16
+func (x Int32x8) SaturateToInt16ConcatGrouped(y Int32x8) Int16x16
  
-// SaturateToInt16Concat converts element values to int16 with signed saturation.
+// SaturateToInt16ConcatGrouped converts element values to int16 with signed saturation.
  // With each 128-bit as a group:
-// The converted group from the first input vector will be packed to the lower part of the result vector,
-// the converted group from the second input vector will be packed to the upper part of the result vector.
+// The converted elements from x will be packed to the lower part of the group in the result vector,
+// the converted elements from y will be packed to the upper part of the group in the result vector.
  //
  // Asm: VPACKSSDW, CPU Feature: AVX512
-func (x Int32x16) SaturateToInt16Concat(y Int32x16) Int16x32
+func (x Int32x16) SaturateToInt16ConcatGrouped(y Int32x16) Int16x32
  
  /* SaturateToInt32 */
  
@@ -5550,19 +5551,29 @@ func (x Uint64x8) SaturateToUint16() Uint16x8
  /* SaturateToUint16Concat */
  
  // SaturateToUint16Concat converts element values to uint16 with unsigned saturation.
+// The converted elements from x will be packed to the lower part of the result vector,
+// the converted elements from y will be packed to the upper part of the result vector.
  //
  // Asm: VPACKUSDW, CPU Feature: AVX
-func (x Uint32x4) SaturateToUint16Concat(y Uint32x4) Uint16x8
+func (x Int32x4) SaturateToUint16Concat(y Int32x4) Uint16x8
  
-// SaturateToUint16Concat converts element values to uint16 with unsigned saturation.
+/* SaturateToUint16ConcatGrouped */
+
+// SaturateToUint16ConcatGrouped converts element values to uint16 with unsigned saturation.
+// With each 128-bit as a group:
+// The converted elements from x will be packed to the lower part of the group in the result vector,
+// the converted elements from y will be packed to the upper part of the group in the result vector.
  //
  // Asm: VPACKUSDW, CPU Feature: AVX2
-func (x Uint32x8) SaturateToUint16Concat(y Uint32x8) Uint16x16
+func (x Int32x8) SaturateToUint16ConcatGrouped(y Int32x8) Uint16x16
  
-// SaturateToUint16Concat converts element values to uint16 with unsigned saturation.
+// SaturateToUint16ConcatGrouped converts element values to uint16 with unsigned saturation.
+// With each 128-bit as a group:
+// The converted elements from x will be packed to the lower part of the group in the result vector,
+// the converted elements from y will be packed to the upper part of the group in the result vector.
  //
  // Asm: VPACKUSDW, CPU Feature: AVX512
-func (x Uint32x16) SaturateToUint16Concat(y Uint32x16) Uint16x32
+func (x Int32x16) SaturateToUint16ConcatGrouped(y Int32x16) Uint16x32
  
  /* SaturateToUint32 */
author	Cherry Mui <cherryyz@google.com>
	Fri, 19 Dec 2025 20:21:15 +0000 (15:21 -0500)
committer	Cherry Mui <cherryyz@google.com>
	Fri, 19 Dec 2025 22:39:26 +0000 (14:39 -0800)
src/cmd/compile/internal/amd64/simdssa.go		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules		patch \| blob \| history
src/cmd/compile/internal/ssa/_gen/simdgenericOps.go		patch \| blob \| history
src/cmd/compile/internal/ssa/opGen.go		patch \| blob \| history
src/cmd/compile/internal/ssa/rewriteAMD64.go		patch \| blob \| history
src/cmd/compile/internal/ssagen/simdintrinsics.go		patch \| blob \| history
src/simd/archsimd/_gen/simdgen/godefs.go		patch \| blob \| history
src/simd/archsimd/_gen/simdgen/ops/Converts/categories.yaml		patch \| blob \| history
src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml		patch \| blob \| history
src/simd/archsimd/ops_amd64.go		patch \| blob \| history