ssa.OpAMD64VPRORVQMasked128,
ssa.OpAMD64VPRORVQMasked256,
ssa.OpAMD64VPRORVQMasked512,
- ssa.OpAMD64VPACKSSDWMasked128,
ssa.OpAMD64VPACKSSDWMasked256,
ssa.OpAMD64VPACKSSDWMasked512,
- ssa.OpAMD64VPACKUSDWMasked128,
+ ssa.OpAMD64VPACKSSDWMasked128,
ssa.OpAMD64VPACKUSDWMasked256,
ssa.OpAMD64VPACKUSDWMasked512,
+ ssa.OpAMD64VPACKUSDWMasked128,
ssa.OpAMD64VSCALEFPSMasked128,
ssa.OpAMD64VSCALEFPSMasked256,
ssa.OpAMD64VSCALEFPSMasked512,
ssa.OpAMD64VPRORVQMasked128Merging,
ssa.OpAMD64VPRORVQMasked256Merging,
ssa.OpAMD64VPRORVQMasked512Merging,
- ssa.OpAMD64VPACKSSDWMasked128Merging,
ssa.OpAMD64VPACKSSDWMasked256Merging,
ssa.OpAMD64VPACKSSDWMasked512Merging,
- ssa.OpAMD64VPACKUSDWMasked128Merging,
+ ssa.OpAMD64VPACKSSDWMasked128Merging,
ssa.OpAMD64VPACKUSDWMasked256Merging,
ssa.OpAMD64VPACKUSDWMasked512Merging,
+ ssa.OpAMD64VPACKUSDWMasked128Merging,
ssa.OpAMD64VSCALEFPSMasked128Merging,
ssa.OpAMD64VSCALEFPSMasked256Merging,
ssa.OpAMD64VSCALEFPSMasked512Merging,
ssa.OpAMD64VPRORVQMasked128load,
ssa.OpAMD64VPRORVQMasked256load,
ssa.OpAMD64VPRORVQMasked512load,
- ssa.OpAMD64VPACKSSDWMasked128load,
ssa.OpAMD64VPACKSSDWMasked256load,
ssa.OpAMD64VPACKSSDWMasked512load,
- ssa.OpAMD64VPACKUSDWMasked128load,
+ ssa.OpAMD64VPACKSSDWMasked128load,
ssa.OpAMD64VPACKUSDWMasked256load,
ssa.OpAMD64VPACKUSDWMasked512load,
+ ssa.OpAMD64VPACKUSDWMasked128load,
ssa.OpAMD64VSCALEFPSMasked128load,
ssa.OpAMD64VSCALEFPSMasked256load,
ssa.OpAMD64VSCALEFPSMasked512load,
ssa.OpAMD64VPMOVSQBMasked128_128,
ssa.OpAMD64VPMOVSQBMasked128_256,
ssa.OpAMD64VPMOVSQBMasked128_512,
- ssa.OpAMD64VPACKSSDWMasked128,
- ssa.OpAMD64VPACKSSDWMasked128load,
ssa.OpAMD64VPACKSSDWMasked256,
ssa.OpAMD64VPACKSSDWMasked256load,
ssa.OpAMD64VPACKSSDWMasked512,
ssa.OpAMD64VPACKSSDWMasked512load,
+ ssa.OpAMD64VPACKSSDWMasked128,
+ ssa.OpAMD64VPACKSSDWMasked128load,
ssa.OpAMD64VPMOVSDWMasked128_128,
ssa.OpAMD64VPMOVSDWMasked128_256,
ssa.OpAMD64VPMOVSDWMasked256,
ssa.OpAMD64VPMOVUSQBMasked128_128,
ssa.OpAMD64VPMOVUSQBMasked128_256,
ssa.OpAMD64VPMOVUSQBMasked128_512,
- ssa.OpAMD64VPACKUSDWMasked128,
- ssa.OpAMD64VPACKUSDWMasked128load,
ssa.OpAMD64VPACKUSDWMasked256,
ssa.OpAMD64VPACKUSDWMasked256load,
ssa.OpAMD64VPACKUSDWMasked512,
ssa.OpAMD64VPACKUSDWMasked512load,
+ ssa.OpAMD64VPACKUSDWMasked128,
+ ssa.OpAMD64VPACKUSDWMasked128load,
ssa.OpAMD64VPMOVUSDWMasked128_128,
ssa.OpAMD64VPMOVUSDWMasked128_256,
ssa.OpAMD64VPMOVUSDWMasked256,
(SaturateToInt16Int64x4 ...) => (VPMOVSQW128_256 ...)
(SaturateToInt16Int64x8 ...) => (VPMOVSQW128_512 ...)
(SaturateToInt16ConcatInt32x4 ...) => (VPACKSSDW128 ...)
-(SaturateToInt16ConcatInt32x8 ...) => (VPACKSSDW256 ...)
-(SaturateToInt16ConcatInt32x16 ...) => (VPACKSSDW512 ...)
+(SaturateToInt16ConcatGroupedInt32x8 ...) => (VPACKSSDW256 ...)
+(SaturateToInt16ConcatGroupedInt32x16 ...) => (VPACKSSDW512 ...)
(SaturateToInt32Int64x2 ...) => (VPMOVSQD128_128 ...)
(SaturateToInt32Int64x4 ...) => (VPMOVSQD128_256 ...)
(SaturateToInt32Int64x8 ...) => (VPMOVSQD256 ...)
(SaturateToUint16Uint64x2 ...) => (VPMOVUSQW128_128 ...)
(SaturateToUint16Uint64x4 ...) => (VPMOVUSQW128_256 ...)
(SaturateToUint16Uint64x8 ...) => (VPMOVUSQW128_512 ...)
-(SaturateToUint16ConcatUint32x4 ...) => (VPACKUSDW128 ...)
-(SaturateToUint16ConcatUint32x8 ...) => (VPACKUSDW256 ...)
-(SaturateToUint16ConcatUint32x16 ...) => (VPACKUSDW512 ...)
+(SaturateToUint16ConcatInt32x4 ...) => (VPACKUSDW128 ...)
+(SaturateToUint16ConcatGroupedInt32x8 ...) => (VPACKUSDW256 ...)
+(SaturateToUint16ConcatGroupedInt32x16 ...) => (VPACKUSDW512 ...)
(SaturateToUint32Uint64x2 ...) => (VPMOVUSQD128_128 ...)
(SaturateToUint32Uint64x4 ...) => (VPMOVUSQD128_256 ...)
(SaturateToUint32Uint64x8 ...) => (VPMOVUSQD256 ...)
(VMOVDQU64Masked128 (VPMOVSQB128_128 x) mask) => (VPMOVSQBMasked128_128 x mask)
(VMOVDQU64Masked256 (VPMOVSQB128_256 x) mask) => (VPMOVSQBMasked128_256 x mask)
(VMOVDQU64Masked512 (VPMOVSQB128_512 x) mask) => (VPMOVSQBMasked128_512 x mask)
-(VMOVDQU32Masked128 (VPACKSSDW128 x y) mask) => (VPACKSSDWMasked128 x y mask)
(VMOVDQU32Masked256 (VPACKSSDW256 x y) mask) => (VPACKSSDWMasked256 x y mask)
(VMOVDQU32Masked512 (VPACKSSDW512 x y) mask) => (VPACKSSDWMasked512 x y mask)
+(VMOVDQU32Masked128 (VPACKSSDW128 x y) mask) => (VPACKSSDWMasked128 x y mask)
(VMOVDQU32Masked128 (VPMOVSDW128_128 x) mask) => (VPMOVSDWMasked128_128 x mask)
(VMOVDQU32Masked256 (VPMOVSDW128_256 x) mask) => (VPMOVSDWMasked128_256 x mask)
(VMOVDQU32Masked256 (VPMOVSDW256 x) mask) => (VPMOVSDWMasked256 x mask)
(VMOVDQU64Masked128 (VPMOVUSQB128_128 x) mask) => (VPMOVUSQBMasked128_128 x mask)
(VMOVDQU64Masked256 (VPMOVUSQB128_256 x) mask) => (VPMOVUSQBMasked128_256 x mask)
(VMOVDQU64Masked512 (VPMOVUSQB128_512 x) mask) => (VPMOVUSQBMasked128_512 x mask)
-(VMOVDQU32Masked128 (VPACKUSDW128 x y) mask) => (VPACKUSDWMasked128 x y mask)
(VMOVDQU32Masked256 (VPACKUSDW256 x y) mask) => (VPACKUSDWMasked256 x y mask)
(VMOVDQU32Masked512 (VPACKUSDW512 x y) mask) => (VPACKUSDWMasked512 x y mask)
+(VMOVDQU32Masked128 (VPACKUSDW128 x y) mask) => (VPACKUSDWMasked128 x y mask)
(VMOVDQU32Masked128 (VPMOVUSDW128_128 x) mask) => (VPMOVUSDWMasked128_128 x mask)
(VMOVDQU32Masked256 (VPMOVUSDW128_256 x) mask) => (VPMOVUSDWMasked128_256 x mask)
(VMOVDQU32Masked256 (VPMOVUSDW256 x) mask) => (VPMOVUSDWMasked256 x mask)
(VPRORVQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORVQMasked256load {sym} [off] x ptr mask mem)
(VPRORVQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORVQMasked512load {sym} [off] x ptr mask mem)
(VPACKSSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDW512load {sym} [off] x ptr mem)
-(VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem)
(VPACKSSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked256load {sym} [off] x ptr mask mem)
(VPACKSSDWMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked512load {sym} [off] x ptr mask mem)
+(VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem)
(VPACKUSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDW512load {sym} [off] x ptr mem)
-(VPACKUSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked128load {sym} [off] x ptr mask mem)
(VPACKUSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked256load {sym} [off] x ptr mask mem)
(VPACKUSDWMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked512load {sym} [off] x ptr mask mem)
+(VPACKUSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKUSDWMasked128load {sym} [off] x ptr mask mem)
(VSCALEFPS128 x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPS128load {sym} [off] x ptr mem)
(VSCALEFPS256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPS256load {sym} [off] x ptr mem)
(VSCALEFPS512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPS512load {sym} [off] x ptr mem)
{name: "SaturateToInt8Int64x2", argLength: 1, commutative: false},
{name: "SaturateToInt8Int64x4", argLength: 1, commutative: false},
{name: "SaturateToInt8Int64x8", argLength: 1, commutative: false},
+ {name: "SaturateToInt16ConcatGroupedInt32x8", argLength: 2, commutative: false},
+ {name: "SaturateToInt16ConcatGroupedInt32x16", argLength: 2, commutative: false},
{name: "SaturateToInt16ConcatInt32x4", argLength: 2, commutative: false},
- {name: "SaturateToInt16ConcatInt32x8", argLength: 2, commutative: false},
- {name: "SaturateToInt16ConcatInt32x16", argLength: 2, commutative: false},
{name: "SaturateToInt16Int32x4", argLength: 1, commutative: false},
{name: "SaturateToInt16Int32x8", argLength: 1, commutative: false},
{name: "SaturateToInt16Int32x16", argLength: 1, commutative: false},
{name: "SaturateToUint8Uint64x2", argLength: 1, commutative: false},
{name: "SaturateToUint8Uint64x4", argLength: 1, commutative: false},
{name: "SaturateToUint8Uint64x8", argLength: 1, commutative: false},
- {name: "SaturateToUint16ConcatUint32x4", argLength: 2, commutative: false},
- {name: "SaturateToUint16ConcatUint32x8", argLength: 2, commutative: false},
- {name: "SaturateToUint16ConcatUint32x16", argLength: 2, commutative: false},
+ {name: "SaturateToUint16ConcatGroupedInt32x8", argLength: 2, commutative: false},
+ {name: "SaturateToUint16ConcatGroupedInt32x16", argLength: 2, commutative: false},
+ {name: "SaturateToUint16ConcatInt32x4", argLength: 2, commutative: false},
{name: "SaturateToUint16Uint32x4", argLength: 1, commutative: false},
{name: "SaturateToUint16Uint32x8", argLength: 1, commutative: false},
{name: "SaturateToUint16Uint32x16", argLength: 1, commutative: false},
OpSaturateToInt8Int64x2
OpSaturateToInt8Int64x4
OpSaturateToInt8Int64x8
+ OpSaturateToInt16ConcatGroupedInt32x8
+ OpSaturateToInt16ConcatGroupedInt32x16
OpSaturateToInt16ConcatInt32x4
- OpSaturateToInt16ConcatInt32x8
- OpSaturateToInt16ConcatInt32x16
OpSaturateToInt16Int32x4
OpSaturateToInt16Int32x8
OpSaturateToInt16Int32x16
OpSaturateToUint8Uint64x2
OpSaturateToUint8Uint64x4
OpSaturateToUint8Uint64x8
- OpSaturateToUint16ConcatUint32x4
- OpSaturateToUint16ConcatUint32x8
- OpSaturateToUint16ConcatUint32x16
+ OpSaturateToUint16ConcatGroupedInt32x8
+ OpSaturateToUint16ConcatGroupedInt32x16
+ OpSaturateToUint16ConcatInt32x4
OpSaturateToUint16Uint32x4
OpSaturateToUint16Uint32x8
OpSaturateToUint16Uint32x16
generic: true,
},
{
- name: "SaturateToInt16ConcatInt32x4",
+ name: "SaturateToInt16ConcatGroupedInt32x8",
argLen: 2,
generic: true,
},
{
- name: "SaturateToInt16ConcatInt32x8",
+ name: "SaturateToInt16ConcatGroupedInt32x16",
argLen: 2,
generic: true,
},
{
- name: "SaturateToInt16ConcatInt32x16",
+ name: "SaturateToInt16ConcatInt32x4",
argLen: 2,
generic: true,
},
generic: true,
},
{
- name: "SaturateToUint16ConcatUint32x4",
+ name: "SaturateToUint16ConcatGroupedInt32x8",
argLen: 2,
generic: true,
},
{
- name: "SaturateToUint16ConcatUint32x8",
+ name: "SaturateToUint16ConcatGroupedInt32x16",
argLen: 2,
generic: true,
},
{
- name: "SaturateToUint16ConcatUint32x16",
+ name: "SaturateToUint16ConcatInt32x4",
argLen: 2,
generic: true,
},
case OpSHA256TwoRoundsUint32x4:
v.Op = OpAMD64SHA256RNDS2128
return true
- case OpSaturateToInt16ConcatInt32x16:
+ case OpSaturateToInt16ConcatGroupedInt32x16:
v.Op = OpAMD64VPACKSSDW512
return true
+ case OpSaturateToInt16ConcatGroupedInt32x8:
+ v.Op = OpAMD64VPACKSSDW256
+ return true
case OpSaturateToInt16ConcatInt32x4:
v.Op = OpAMD64VPACKSSDW128
return true
- case OpSaturateToInt16ConcatInt32x8:
- v.Op = OpAMD64VPACKSSDW256
- return true
case OpSaturateToInt16Int32x16:
v.Op = OpAMD64VPMOVSDW256
return true
case OpSaturateToInt8Int64x8:
v.Op = OpAMD64VPMOVSQB128_512
return true
- case OpSaturateToUint16ConcatUint32x16:
+ case OpSaturateToUint16ConcatGroupedInt32x16:
v.Op = OpAMD64VPACKUSDW512
return true
- case OpSaturateToUint16ConcatUint32x4:
- v.Op = OpAMD64VPACKUSDW128
- return true
- case OpSaturateToUint16ConcatUint32x8:
+ case OpSaturateToUint16ConcatGroupedInt32x8:
v.Op = OpAMD64VPACKUSDW256
return true
+ case OpSaturateToUint16ConcatInt32x4:
+ v.Op = OpAMD64VPACKUSDW128
+ return true
case OpSaturateToUint16Uint32x16:
v.Op = OpAMD64VPMOVUSDW256
return true
addF(simdPackage, "Int64x4.SaturateToInt16", opLen1(ssa.OpSaturateToInt16Int64x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x8.SaturateToInt16", opLen1(ssa.OpSaturateToInt16Int64x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x4.SaturateToInt16Concat", opLen2(ssa.OpSaturateToInt16ConcatInt32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int32x8.SaturateToInt16Concat", opLen2(ssa.OpSaturateToInt16ConcatInt32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Int32x16.SaturateToInt16Concat", opLen2(ssa.OpSaturateToInt16ConcatInt32x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int32x8.SaturateToInt16ConcatGrouped", opLen2(ssa.OpSaturateToInt16ConcatGroupedInt32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x16.SaturateToInt16ConcatGrouped", opLen2(ssa.OpSaturateToInt16ConcatGroupedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x2.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x4.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x8.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x2.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint64x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x8.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint64x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint32x4.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatUint32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint32x8.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatUint32x8, types.TypeVec256), sys.AMD64)
- addF(simdPackage, "Uint32x16.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatUint32x16, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Int32x4.SaturateToUint16Concat", opLen2(ssa.OpSaturateToUint16ConcatInt32x4, types.TypeVec128), sys.AMD64)
+ addF(simdPackage, "Int32x8.SaturateToUint16ConcatGrouped", opLen2(ssa.OpSaturateToUint16ConcatGroupedInt32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x16.SaturateToUint16ConcatGrouped", opLen2(ssa.OpSaturateToUint16ConcatGroupedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x2.SaturateToUint32", opLen1(ssa.OpSaturateToUint32Uint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.SaturateToUint32", opLen1(ssa.OpSaturateToUint32Uint64x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x8.SaturateToUint32", opLen1(ssa.OpSaturateToUint32Uint64x8, types.TypeVec256), sys.AMD64)
outLanes := o.Out[0].Lanes
if inLanes != nil && outLanes != nil && *inLanes < *outLanes {
if (strings.Contains(o.Go, "Saturate") || strings.Contains(o.Go, "Truncate")) &&
- !strings.HasSuffix(o.Go, "Concat") {
+ !strings.Contains(o.Go, "Concat") {
o.Documentation += "\n// Results are packed to low elements in the returned vector, its upper elements are zeroed."
}
}
regexpTag: "convert"
documentation: !string |-
// NAME truncates element values to int16.
-- go: "SaturateToInt16(Concat)?"
+- go: "SaturateToInt16(Concat(Grouped)?)?"
commutative: false
regexpTag: "convert"
documentation: !string |-
regexpTag: "convert"
documentation: !string |-
// NAME truncates element values to uint16.
-- go: "SaturateToUint16(Concat)?"
+- go: "SaturateToUint16(Concat(Grouped)?)?"
commutative: false
regexpTag: "convert"
documentation: !string |-
regexpTag: "convert"
asm: "VPACKSSDW"
addDoc: &satDocConcat
+ !string |-
+ // The converted elements from x are packed into the lower half of the result vector,
+ // and the converted elements from y are packed into the upper half of the result vector.
+ in:
+ - base: int
+ - base: int
+ out:
+ - base: int
+ bits: 128
+- go: SaturateToInt16ConcatGrouped
+ regexpTag: "convert"
+ asm: "VPACKSSDW"
+ addDoc: &satDocConcatGrouped
!string |-
// With each 128-bit as a group:
- // The converted group from the first input vector will be packed to the lower part of the result vector,
- // the converted group from the second input vector will be packed to the upper part of the result vector.
+ // The converted elements from x are packed into the lower half of each group in the result vector,
+ // and the converted elements from y are packed into the upper half of that group.
in:
- base: int
- base: int
out:
- base: int
+ bits: 256|512
- go: SaturateToUint16Concat
regexpTag: "convert"
asm: "VPACKUSDW"
+ addDoc: *satDocConcat
in:
+ - base: int
+ - base: int
+ out:
- base: uint
- - base: uint
+ bits: 128
+- go: SaturateToUint16ConcatGrouped
+ regexpTag: "convert"
+ asm: "VPACKUSDW"
+ addDoc: *satDocConcatGrouped
+ in:
+ - base: int
+ - base: int
out:
- base: uint
+ bits: 256|512
# low-part only conversions.
# uint8->uint16
/* SaturateToInt16Concat */
// SaturateToInt16Concat converts element values to int16 with signed saturation.
-// With each 128-bit as a group:
-// The converted group from the first input vector will be packed to the lower part of the result vector,
-// the converted group from the second input vector will be packed to the upper part of the result vector.
+// The converted elements from x are packed into the lower half of the result vector,
+// and the converted elements from y are packed into the upper half of the result vector.
//
// Asm: VPACKSSDW, CPU Feature: AVX
func (x Int32x4) SaturateToInt16Concat(y Int32x4) Int16x8
-// SaturateToInt16Concat converts element values to int16 with signed saturation.
+/* SaturateToInt16ConcatGrouped */
+
+// SaturateToInt16ConcatGrouped converts element values to int16 with signed saturation.
// With each 128-bit as a group:
-// The converted group from the first input vector will be packed to the lower part of the result vector,
-// the converted group from the second input vector will be packed to the upper part of the result vector.
+// The converted elements from x are packed into the lower half of each group in the result vector,
+// and the converted elements from y are packed into the upper half of that group.
//
// Asm: VPACKSSDW, CPU Feature: AVX2
-func (x Int32x8) SaturateToInt16Concat(y Int32x8) Int16x16
+func (x Int32x8) SaturateToInt16ConcatGrouped(y Int32x8) Int16x16
-// SaturateToInt16Concat converts element values to int16 with signed saturation.
+// SaturateToInt16ConcatGrouped converts element values to int16 with signed saturation.
// With each 128-bit as a group:
-// The converted group from the first input vector will be packed to the lower part of the result vector,
-// the converted group from the second input vector will be packed to the upper part of the result vector.
+// The converted elements from x are packed into the lower half of each group in the result vector,
+// and the converted elements from y are packed into the upper half of that group.
//
// Asm: VPACKSSDW, CPU Feature: AVX512
-func (x Int32x16) SaturateToInt16Concat(y Int32x16) Int16x32
+func (x Int32x16) SaturateToInt16ConcatGrouped(y Int32x16) Int16x32
/* SaturateToInt32 */
/* SaturateToUint16Concat */
// SaturateToUint16Concat converts element values to uint16 with unsigned saturation.
+// The converted elements from x are packed into the lower half of the result vector,
+// and the converted elements from y are packed into the upper half of the result vector.
//
// Asm: VPACKUSDW, CPU Feature: AVX
-func (x Uint32x4) SaturateToUint16Concat(y Uint32x4) Uint16x8
+func (x Int32x4) SaturateToUint16Concat(y Int32x4) Uint16x8
-// SaturateToUint16Concat converts element values to uint16 with unsigned saturation.
+/* SaturateToUint16ConcatGrouped */
+
+// SaturateToUint16ConcatGrouped converts element values to uint16 with unsigned saturation.
+// With each 128-bit as a group:
+// The converted elements from x are packed into the lower half of each group in the result vector,
+// and the converted elements from y are packed into the upper half of that group.
//
// Asm: VPACKUSDW, CPU Feature: AVX2
-func (x Uint32x8) SaturateToUint16Concat(y Uint32x8) Uint16x16
+func (x Int32x8) SaturateToUint16ConcatGrouped(y Int32x8) Uint16x16
-// SaturateToUint16Concat converts element values to uint16 with unsigned saturation.
+// SaturateToUint16ConcatGrouped converts element values to uint16 with unsigned saturation.
+// With each 128-bit as a group:
+// The converted elements from x are packed into the lower half of each group in the result vector,
+// and the converted elements from y are packed into the upper half of that group.
//
// Asm: VPACKUSDW, CPU Feature: AVX512
-func (x Uint32x16) SaturateToUint16Concat(y Uint32x16) Uint16x32
+func (x Int32x16) SaturateToUint16ConcatGrouped(y Int32x16) Uint16x32
/* SaturateToUint32 */