From: Cherry Mui
Date: Mon, 22 Dec 2025 19:52:57 +0000 (-0500)
Subject: simd/archsimd: correct documentation for pairwise operations
X-Git-Tag: go1.26rc2~7^2~49
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=c1efada1d20a0f4af6ffc2be17706713af11b3b0;p=gostls13.git

simd/archsimd: correct documentation for pairwise operations

For AddPairs and SubPairs, and their saturated variants, the
documented result element order is wrong. Correct it.

Also, for 256-bit vectors these are grouped operations, so name them
with the Grouped suffix to make that clear.

Change-Id: Idfd0975cb4a332b2e28c898613861205d26f75b0
Reviewed-on: https://go-review.googlesource.com/c/go/+/732020
LUCI-TryBot-Result: Go LUCI
Reviewed-by: David Chase
---

diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index e3fc2fb380..8ef6e5c7f4 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -250,12 +250,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPADDQ256,
 		ssa.OpAMD64VPADDQ512,
 		ssa.OpAMD64VHADDPS128,
-		ssa.OpAMD64VHADDPS256,
 		ssa.OpAMD64VHADDPD128,
-		ssa.OpAMD64VHADDPD256,
 		ssa.OpAMD64VPHADDW128,
-		ssa.OpAMD64VPHADDW256,
 		ssa.OpAMD64VPHADDD128,
+		ssa.OpAMD64VHADDPS256,
+		ssa.OpAMD64VHADDPD256,
+		ssa.OpAMD64VPHADDW256,
 		ssa.OpAMD64VPHADDD256,
 		ssa.OpAMD64VPHADDSW128,
 		ssa.OpAMD64VPHADDSW256,
@@ -520,12 +520,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPSUBQ256,
 		ssa.OpAMD64VPSUBQ512,
 		ssa.OpAMD64VHSUBPS128,
-		ssa.OpAMD64VHSUBPS256,
 		ssa.OpAMD64VHSUBPD128,
-		ssa.OpAMD64VHSUBPD256,
 		ssa.OpAMD64VPHSUBW128,
-		ssa.OpAMD64VPHSUBW256,
 		ssa.OpAMD64VPHSUBD128,
+		ssa.OpAMD64VHSUBPS256,
+		ssa.OpAMD64VHSUBPD256,
+		ssa.OpAMD64VPHSUBW256,
 		ssa.OpAMD64VPHSUBD256,
 		ssa.OpAMD64VPHSUBSW128,
 		ssa.OpAMD64VPHSUBSW256,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 793cf5c97f..dd8f9f91b3 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -57,19 +57,19 @@
 (AddUint64x4 ...) => (VPADDQ256 ...)
 (AddUint64x8 ...) => (VPADDQ512 ...)
 (AddPairsFloat32x4 ...) => (VHADDPS128 ...)
-(AddPairsFloat32x8 ...) => (VHADDPS256 ...)
 (AddPairsFloat64x2 ...) => (VHADDPD128 ...)
-(AddPairsFloat64x4 ...) => (VHADDPD256 ...)
 (AddPairsInt16x8 ...) => (VPHADDW128 ...)
-(AddPairsInt16x16 ...) => (VPHADDW256 ...)
 (AddPairsInt32x4 ...) => (VPHADDD128 ...)
-(AddPairsInt32x8 ...) => (VPHADDD256 ...)
 (AddPairsUint16x8 ...) => (VPHADDW128 ...)
-(AddPairsUint16x16 ...) => (VPHADDW256 ...)
 (AddPairsUint32x4 ...) => (VPHADDD128 ...)
-(AddPairsUint32x8 ...) => (VPHADDD256 ...)
+(AddPairsGroupedFloat32x8 ...) => (VHADDPS256 ...)
+(AddPairsGroupedFloat64x4 ...) => (VHADDPD256 ...)
+(AddPairsGroupedInt16x16 ...) => (VPHADDW256 ...)
+(AddPairsGroupedInt32x8 ...) => (VPHADDD256 ...)
+(AddPairsGroupedUint16x16 ...) => (VPHADDW256 ...)
+(AddPairsGroupedUint32x8 ...) => (VPHADDD256 ...)
 (AddPairsSaturatedInt16x8 ...) => (VPHADDSW128 ...)
-(AddPairsSaturatedInt16x16 ...) => (VPHADDSW256 ...)
+(AddPairsSaturatedGroupedInt16x16 ...) => (VPHADDSW256 ...)
 (AddSaturatedInt8x16 ...) => (VPADDSB128 ...)
 (AddSaturatedInt8x32 ...) => (VPADDSB256 ...)
 (AddSaturatedInt8x64 ...) => (VPADDSB512 ...)
@@ -1217,19 +1217,19 @@
 (SubUint64x4 ...) => (VPSUBQ256 ...)
 (SubUint64x8 ...) => (VPSUBQ512 ...)
 (SubPairsFloat32x4 ...) => (VHSUBPS128 ...)
-(SubPairsFloat32x8 ...) => (VHSUBPS256 ...)
 (SubPairsFloat64x2 ...) => (VHSUBPD128 ...)
-(SubPairsFloat64x4 ...) => (VHSUBPD256 ...) (SubPairsInt16x8 ...) => (VPHSUBW128 ...) -(SubPairsInt16x16 ...) => (VPHSUBW256 ...) (SubPairsInt32x4 ...) => (VPHSUBD128 ...) -(SubPairsInt32x8 ...) => (VPHSUBD256 ...) (SubPairsUint16x8 ...) => (VPHSUBW128 ...) -(SubPairsUint16x16 ...) => (VPHSUBW256 ...) (SubPairsUint32x4 ...) => (VPHSUBD128 ...) -(SubPairsUint32x8 ...) => (VPHSUBD256 ...) +(SubPairsGroupedFloat32x8 ...) => (VHSUBPS256 ...) +(SubPairsGroupedFloat64x4 ...) => (VHSUBPD256 ...) +(SubPairsGroupedInt16x16 ...) => (VPHSUBW256 ...) +(SubPairsGroupedInt32x8 ...) => (VPHSUBD256 ...) +(SubPairsGroupedUint16x16 ...) => (VPHSUBW256 ...) +(SubPairsGroupedUint32x8 ...) => (VPHSUBD256 ...) (SubPairsSaturatedInt16x8 ...) => (VPHSUBSW128 ...) -(SubPairsSaturatedInt16x16 ...) => (VPHSUBSW256 ...) +(SubPairsSaturatedGroupedInt16x16 ...) => (VPHSUBSW256 ...) (SubSaturatedInt8x16 ...) => (VPSUBSB128 ...) (SubSaturatedInt8x32 ...) => (VPSUBSB256 ...) (SubSaturatedInt8x64 ...) => (VPSUBSB512 ...) diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 8afa2bf259..0ae127a60d 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -48,19 +48,19 @@ func simdGenericOps() []opData { {name: "AddInt64x4", argLength: 2, commutative: true}, {name: "AddInt64x8", argLength: 2, commutative: true}, {name: "AddPairsFloat32x4", argLength: 2, commutative: false}, - {name: "AddPairsFloat32x8", argLength: 2, commutative: false}, {name: "AddPairsFloat64x2", argLength: 2, commutative: false}, - {name: "AddPairsFloat64x4", argLength: 2, commutative: false}, + {name: "AddPairsGroupedFloat32x8", argLength: 2, commutative: false}, + {name: "AddPairsGroupedFloat64x4", argLength: 2, commutative: false}, + {name: "AddPairsGroupedInt16x16", argLength: 2, commutative: false}, + {name: "AddPairsGroupedInt32x8", argLength: 2, commutative: false}, + {name: "AddPairsGroupedUint16x16", argLength: 2, commutative: false}, + {name: "AddPairsGroupedUint32x8", argLength: 2, commutative: false}, {name: "AddPairsInt16x8", argLength: 2, commutative: false}, - {name: "AddPairsInt16x16", argLength: 2, commutative: false}, {name: "AddPairsInt32x4", argLength: 2, commutative: false}, - {name: "AddPairsInt32x8", argLength: 2, commutative: false}, + {name: "AddPairsSaturatedGroupedInt16x16", argLength: 2, commutative: false}, {name: "AddPairsSaturatedInt16x8", argLength: 2, commutative: false}, - {name: "AddPairsSaturatedInt16x16", argLength: 2, commutative: false}, {name: "AddPairsUint16x8", argLength: 2, commutative: false}, - {name: "AddPairsUint16x16", argLength: 2, commutative: false}, {name: "AddPairsUint32x4", argLength: 2, commutative: false}, - {name: "AddPairsUint32x8", argLength: 2, commutative: false}, {name: "AddSaturatedInt8x16", argLength: 2, commutative: true}, {name: "AddSaturatedInt8x32", argLength: 2, commutative: true}, {name: "AddSaturatedInt8x64", argLength: 2, commutative: true}, @@ -1036,19 +1036,19 @@ func simdGenericOps() []opData { {name: "SubInt64x4", argLength: 2, commutative: false}, {name: "SubInt64x8", argLength: 2, commutative: false}, {name: "SubPairsFloat32x4", argLength: 2, commutative: false}, - {name: "SubPairsFloat32x8", argLength: 2, commutative: false}, {name: "SubPairsFloat64x2", argLength: 2, commutative: false}, - {name: "SubPairsFloat64x4", argLength: 2, commutative: false}, + {name: "SubPairsGroupedFloat32x8", argLength: 2, commutative: false}, + {name: 
"SubPairsGroupedFloat64x4", argLength: 2, commutative: false}, + {name: "SubPairsGroupedInt16x16", argLength: 2, commutative: false}, + {name: "SubPairsGroupedInt32x8", argLength: 2, commutative: false}, + {name: "SubPairsGroupedUint16x16", argLength: 2, commutative: false}, + {name: "SubPairsGroupedUint32x8", argLength: 2, commutative: false}, {name: "SubPairsInt16x8", argLength: 2, commutative: false}, - {name: "SubPairsInt16x16", argLength: 2, commutative: false}, {name: "SubPairsInt32x4", argLength: 2, commutative: false}, - {name: "SubPairsInt32x8", argLength: 2, commutative: false}, + {name: "SubPairsSaturatedGroupedInt16x16", argLength: 2, commutative: false}, {name: "SubPairsSaturatedInt16x8", argLength: 2, commutative: false}, - {name: "SubPairsSaturatedInt16x16", argLength: 2, commutative: false}, {name: "SubPairsUint16x8", argLength: 2, commutative: false}, - {name: "SubPairsUint16x16", argLength: 2, commutative: false}, {name: "SubPairsUint32x4", argLength: 2, commutative: false}, - {name: "SubPairsUint32x8", argLength: 2, commutative: false}, {name: "SubSaturatedInt8x16", argLength: 2, commutative: false}, {name: "SubSaturatedInt8x32", argLength: 2, commutative: false}, {name: "SubSaturatedInt8x64", argLength: 2, commutative: false}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 9ba5767596..f318adfd2f 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -6202,19 +6202,19 @@ const ( OpAddInt64x4 OpAddInt64x8 OpAddPairsFloat32x4 - OpAddPairsFloat32x8 OpAddPairsFloat64x2 - OpAddPairsFloat64x4 + OpAddPairsGroupedFloat32x8 + OpAddPairsGroupedFloat64x4 + OpAddPairsGroupedInt16x16 + OpAddPairsGroupedInt32x8 + OpAddPairsGroupedUint16x16 + OpAddPairsGroupedUint32x8 OpAddPairsInt16x8 - OpAddPairsInt16x16 OpAddPairsInt32x4 - OpAddPairsInt32x8 + OpAddPairsSaturatedGroupedInt16x16 OpAddPairsSaturatedInt16x8 - OpAddPairsSaturatedInt16x16 OpAddPairsUint16x8 - OpAddPairsUint16x16 OpAddPairsUint32x4 - OpAddPairsUint32x8 OpAddSaturatedInt8x16 OpAddSaturatedInt8x32 OpAddSaturatedInt8x64 @@ -7190,19 +7190,19 @@ const ( OpSubInt64x4 OpSubInt64x8 OpSubPairsFloat32x4 - OpSubPairsFloat32x8 OpSubPairsFloat64x2 - OpSubPairsFloat64x4 + OpSubPairsGroupedFloat32x8 + OpSubPairsGroupedFloat64x4 + OpSubPairsGroupedInt16x16 + OpSubPairsGroupedInt32x8 + OpSubPairsGroupedUint16x16 + OpSubPairsGroupedUint32x8 OpSubPairsInt16x8 - OpSubPairsInt16x16 OpSubPairsInt32x4 - OpSubPairsInt32x8 + OpSubPairsSaturatedGroupedInt16x16 OpSubPairsSaturatedInt16x8 - OpSubPairsSaturatedInt16x16 OpSubPairsUint16x8 - OpSubPairsUint16x16 OpSubPairsUint32x4 - OpSubPairsUint32x8 OpSubSaturatedInt8x16 OpSubSaturatedInt8x32 OpSubSaturatedInt8x64 @@ -89232,67 +89232,67 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "AddPairsFloat32x8", + name: "AddPairsFloat64x2", argLen: 2, generic: true, }, { - name: "AddPairsFloat64x2", + name: "AddPairsGroupedFloat32x8", argLen: 2, generic: true, }, { - name: "AddPairsFloat64x4", + name: "AddPairsGroupedFloat64x4", argLen: 2, generic: true, }, { - name: "AddPairsInt16x8", + name: "AddPairsGroupedInt16x16", argLen: 2, generic: true, }, { - name: "AddPairsInt16x16", + name: "AddPairsGroupedInt32x8", argLen: 2, generic: true, }, { - name: "AddPairsInt32x4", + name: "AddPairsGroupedUint16x16", argLen: 2, generic: true, }, { - name: "AddPairsInt32x8", + name: "AddPairsGroupedUint32x8", argLen: 2, generic: true, }, { - name: "AddPairsSaturatedInt16x8", + name: "AddPairsInt16x8", argLen: 2, 
generic: true, }, { - name: "AddPairsSaturatedInt16x16", + name: "AddPairsInt32x4", argLen: 2, generic: true, }, { - name: "AddPairsUint16x8", + name: "AddPairsSaturatedGroupedInt16x16", argLen: 2, generic: true, }, { - name: "AddPairsUint16x16", + name: "AddPairsSaturatedInt16x8", argLen: 2, generic: true, }, { - name: "AddPairsUint32x4", + name: "AddPairsUint16x8", argLen: 2, generic: true, }, { - name: "AddPairsUint32x8", + name: "AddPairsUint32x4", argLen: 2, generic: true, }, @@ -94394,67 +94394,67 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "SubPairsFloat32x8", + name: "SubPairsFloat64x2", argLen: 2, generic: true, }, { - name: "SubPairsFloat64x2", + name: "SubPairsGroupedFloat32x8", argLen: 2, generic: true, }, { - name: "SubPairsFloat64x4", + name: "SubPairsGroupedFloat64x4", argLen: 2, generic: true, }, { - name: "SubPairsInt16x8", + name: "SubPairsGroupedInt16x16", argLen: 2, generic: true, }, { - name: "SubPairsInt16x16", + name: "SubPairsGroupedInt32x8", argLen: 2, generic: true, }, { - name: "SubPairsInt32x4", + name: "SubPairsGroupedUint16x16", argLen: 2, generic: true, }, { - name: "SubPairsInt32x8", + name: "SubPairsGroupedUint32x8", argLen: 2, generic: true, }, { - name: "SubPairsSaturatedInt16x8", + name: "SubPairsInt16x8", argLen: 2, generic: true, }, { - name: "SubPairsSaturatedInt16x16", + name: "SubPairsInt32x4", argLen: 2, generic: true, }, { - name: "SubPairsUint16x8", + name: "SubPairsSaturatedGroupedInt16x16", argLen: 2, generic: true, }, { - name: "SubPairsUint16x16", + name: "SubPairsSaturatedInt16x8", argLen: 2, generic: true, }, { - name: "SubPairsUint32x4", + name: "SubPairsUint16x8", argLen: 2, generic: true, }, { - name: "SubPairsUint32x8", + name: "SubPairsUint32x4", argLen: 2, generic: true, }, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 35e9516f61..5fed6a8063 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -2113,45 +2113,45 @@ func rewriteValueAMD64(v *Value) bool { case OpAddPairsFloat32x4: v.Op = OpAMD64VHADDPS128 return true - case OpAddPairsFloat32x8: - v.Op = OpAMD64VHADDPS256 - return true case OpAddPairsFloat64x2: v.Op = OpAMD64VHADDPD128 return true - case OpAddPairsFloat64x4: + case OpAddPairsGroupedFloat32x8: + v.Op = OpAMD64VHADDPS256 + return true + case OpAddPairsGroupedFloat64x4: v.Op = OpAMD64VHADDPD256 return true - case OpAddPairsInt16x16: + case OpAddPairsGroupedInt16x16: + v.Op = OpAMD64VPHADDW256 + return true + case OpAddPairsGroupedInt32x8: + v.Op = OpAMD64VPHADDD256 + return true + case OpAddPairsGroupedUint16x16: v.Op = OpAMD64VPHADDW256 return true + case OpAddPairsGroupedUint32x8: + v.Op = OpAMD64VPHADDD256 + return true case OpAddPairsInt16x8: v.Op = OpAMD64VPHADDW128 return true case OpAddPairsInt32x4: v.Op = OpAMD64VPHADDD128 return true - case OpAddPairsInt32x8: - v.Op = OpAMD64VPHADDD256 - return true - case OpAddPairsSaturatedInt16x16: + case OpAddPairsSaturatedGroupedInt16x16: v.Op = OpAMD64VPHADDSW256 return true case OpAddPairsSaturatedInt16x8: v.Op = OpAMD64VPHADDSW128 return true - case OpAddPairsUint16x16: - v.Op = OpAMD64VPHADDW256 - return true case OpAddPairsUint16x8: v.Op = OpAMD64VPHADDW128 return true case OpAddPairsUint32x4: v.Op = OpAMD64VPHADDD128 return true - case OpAddPairsUint32x8: - v.Op = OpAMD64VPHADDD256 - return true case OpAddPtr: v.Op = OpAMD64ADDQ return true @@ -5860,45 +5860,45 @@ func rewriteValueAMD64(v *Value) bool { case OpSubPairsFloat32x4: 
v.Op = OpAMD64VHSUBPS128 return true - case OpSubPairsFloat32x8: - v.Op = OpAMD64VHSUBPS256 - return true case OpSubPairsFloat64x2: v.Op = OpAMD64VHSUBPD128 return true - case OpSubPairsFloat64x4: + case OpSubPairsGroupedFloat32x8: + v.Op = OpAMD64VHSUBPS256 + return true + case OpSubPairsGroupedFloat64x4: v.Op = OpAMD64VHSUBPD256 return true - case OpSubPairsInt16x16: + case OpSubPairsGroupedInt16x16: + v.Op = OpAMD64VPHSUBW256 + return true + case OpSubPairsGroupedInt32x8: + v.Op = OpAMD64VPHSUBD256 + return true + case OpSubPairsGroupedUint16x16: v.Op = OpAMD64VPHSUBW256 return true + case OpSubPairsGroupedUint32x8: + v.Op = OpAMD64VPHSUBD256 + return true case OpSubPairsInt16x8: v.Op = OpAMD64VPHSUBW128 return true case OpSubPairsInt32x4: v.Op = OpAMD64VPHSUBD128 return true - case OpSubPairsInt32x8: - v.Op = OpAMD64VPHSUBD256 - return true - case OpSubPairsSaturatedInt16x16: + case OpSubPairsSaturatedGroupedInt16x16: v.Op = OpAMD64VPHSUBSW256 return true case OpSubPairsSaturatedInt16x8: v.Op = OpAMD64VPHSUBSW128 return true - case OpSubPairsUint16x16: - v.Op = OpAMD64VPHSUBW256 - return true case OpSubPairsUint16x8: v.Op = OpAMD64VPHSUBW128 return true case OpSubPairsUint32x4: v.Op = OpAMD64VPHSUBD128 return true - case OpSubPairsUint32x8: - v.Op = OpAMD64VPHSUBD256 - return true case OpSubPtr: v.Op = OpAMD64SUBQ return true diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 6769122aa4..59598f0052 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -69,19 +69,19 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint64x4.Add", opLen2(ssa.OpAddUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Add", opLen2(ssa.OpAddUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.AddPairs", opLen2(ssa.OpAddPairsFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.AddPairs", opLen2(ssa.OpAddPairsFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.AddPairs", opLen2(ssa.OpAddPairsFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.AddPairs", opLen2(ssa.OpAddPairsFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x8.AddPairs", opLen2(ssa.OpAddPairsInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.AddPairs", opLen2(ssa.OpAddPairsInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x4.AddPairs", opLen2(ssa.OpAddPairsInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.AddPairs", opLen2(ssa.OpAddPairsInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint16x8.AddPairs", opLen2(ssa.OpAddPairsUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.AddPairs", opLen2(ssa.OpAddPairsUint16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x4.AddPairs", opLen2(ssa.OpAddPairsUint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x8.AddPairs", opLen2(ssa.OpAddPairsUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x4.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x16.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedInt32x8, 
types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x16.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x8.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x16.AddPairsSaturatedGrouped", opLen2(ssa.OpAddPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x16.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x64, types.TypeVec512), sys.AMD64) @@ -1193,19 +1193,19 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint64x4.Sub", opLen2(ssa.OpSubUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Sub", opLen2(ssa.OpSubUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.SubPairs", opLen2(ssa.OpSubPairsFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.SubPairs", opLen2(ssa.OpSubPairsFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.SubPairs", opLen2(ssa.OpSubPairsFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.SubPairs", opLen2(ssa.OpSubPairsFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x8.SubPairs", opLen2(ssa.OpSubPairsInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SubPairs", opLen2(ssa.OpSubPairsInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x4.SubPairs", opLen2(ssa.OpSubPairsInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.SubPairs", opLen2(ssa.OpSubPairsInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint16x8.SubPairs", opLen2(ssa.OpSubPairsUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.SubPairs", opLen2(ssa.OpSubPairsUint16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x4.SubPairs", opLen2(ssa.OpSubPairsUint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x8.SubPairs", opLen2(ssa.OpSubPairsUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x4.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x16.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x16.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x8.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x16.SubPairsSaturatedGrouped", opLen2(ssa.OpSubPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, 
"Int8x16.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x64, types.TypeVec512), sys.AMD64) diff --git a/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml b/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml index 35e8104218..ac5bd825db 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/AddSub/categories.yaml @@ -17,21 +17,83 @@ // NAME subtracts corresponding elements of two vectors with saturation. - go: AddPairs commutative: false + out: + - elemBits: 16|32 documentation: !string |- // NAME horizontally adds adjacent pairs of elements. - // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. + // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. +- go: AddPairs + commutative: false + out: + - elemBits: 64 + documentation: !string |- + // NAME horizontally adds adjacent pairs of elements. + // For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1]. - go: SubPairs commutative: false + out: + - elemBits: 16|32 documentation: !string |- // NAME horizontally subtracts adjacent pairs of elements. - // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. + // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. +- go: SubPairs + commutative: false + out: + - elemBits: 64 + documentation: !string |- + // NAME horizontally subtracts adjacent pairs of elements. + // For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1]. - go: AddPairsSaturated commutative: false documentation: !string |- // NAME horizontally adds adjacent pairs of elements with saturation. - // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. + // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. - go: SubPairsSaturated commutative: false documentation: !string |- // NAME horizontally subtracts adjacent pairs of elements with saturation. - // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. + // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. +- go: AddPairsGrouped + commutative: false + out: + - elemBits: 16|32 + documentation: !string |- + // NAME horizontally adds adjacent pairs of elements. + // With each 128-bit as a group: + // for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. +- go: AddPairsGrouped + commutative: false + out: + - elemBits: 64 + documentation: !string |- + // NAME horizontally adds adjacent pairs of elements. + // With each 128-bit as a group: + // for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1]. +- go: SubPairsGrouped + commutative: false + out: + - elemBits: 16|32 + documentation: !string |- + // NAME horizontally subtracts adjacent pairs of elements. + // With each 128-bit as a group: + // for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. 
+- go: SubPairsGrouped
+  commutative: false
+  out:
+    - elemBits: 64
+  documentation: !string |-
+    // NAME horizontally subtracts adjacent pairs of elements.
+    // With each 128-bit as a group:
+    // for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
+- go: AddPairsSaturatedGrouped
+  commutative: false
+  documentation: !string |-
+    // NAME horizontally adds adjacent pairs of elements with saturation.
+    // With each 128-bit as a group:
+    // for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+- go: SubPairsSaturatedGrouped
+  commutative: false
+  documentation: !string |-
+    // NAME horizontally subtracts adjacent pairs of elements with saturation.
+    // With each 128-bit as a group:
+    // for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
diff --git a/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml
index 4423d8c7c6..17cee597d9 100644
--- a/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml
+++ b/src/simd/archsimd/_gen/simdgen/ops/AddSub/go.yaml
@@ -53,25 +53,71 @@
 - *uint
 - go: AddPairs
   asm: "VPHADD[DW]"
-  in: *2any
-  out: *1any
+  in: &2any128
+    - &any128
+      go: $t
+      bits: 128
+    - *any128
+  out: &1any128
+    - *any128
 - go: SubPairs
   asm: "VPHSUB[DW]"
-  in: *2any
-  out: *1any
+  in: *2any128
+  out: *1any128
 - go: AddPairs
   asm: "VHADDP[SD]" # floats
-  in: *2any
-  out: *1any
+  in: *2any128
+  out: *1any128
 - go: SubPairs
   asm: "VHSUBP[SD]" # floats
-  in: *2any
-  out: *1any
+  in: *2any128
+  out: *1any128
 - go: AddPairsSaturated
   asm: "VPHADDS[DW]"
-  in: *2int
-  out: *1int
+  in: &2int128
+    - &int128
+      go: $t
+      base: int
+      bits: 128
+    - *int128
+  out: &1int128
+    - *int128
 - go: SubPairsSaturated
   asm: "VPHSUBS[DW]"
-  in: *2int
-  out: *1int
+  in: *2int128
+  out: *1int128
+- go: AddPairsGrouped
+  asm: "VPHADD[DW]"
+  in: &2any256
+    - &any256
+      go: $t
+      bits: 256
+    - *any256
+  out: &1any256
+    - *any256
+- go: SubPairsGrouped
+  asm: "VPHSUB[DW]"
+  in: *2any256
+  out: *1any256
+- go: AddPairsGrouped
+  asm: "VHADDP[SD]" # floats
+  in: *2any256
+  out: *1any256
+- go: SubPairsGrouped
+  asm: "VHSUBP[SD]" # floats
+  in: *2any256
+  out: *1any256
+- go: AddPairsSaturatedGrouped
+  asm: "VPHADDS[DW]"
+  in: &2int256
+    - &int256
+      go: $t
+      base: int
+      bits: 256
+    - *int256
+  out: &1int256
+    - *int256
+- go: SubPairsSaturatedGrouped
+  asm: "VPHSUBS[DW]"
+  in: *2int256
+  out: *1int256
diff --git a/src/simd/archsimd/internal/simd_test/simd_test.go b/src/simd/archsimd/internal/simd_test/simd_test.go
index 176faa136e..76b296c91e 100644
--- a/src/simd/archsimd/internal/simd_test/simd_test.go
+++ b/src/simd/archsimd/internal/simd_test/simd_test.go
@@ -13,6 +13,7 @@ import (
 	"simd/archsimd"
 	"slices"
 	"testing"
+	"unsafe"
 )
 
 func TestMain(m *testing.M) {
@@ -1228,3 +1229,70 @@ func TestClMul(t *testing.T) {
 
 	foo(y.CarrylessMultiply(0, 0, y), []uint64{5, 0})
 }
+
+func addPairsSlice[T number](a, b []T) []T {
+	r := make([]T, len(a))
+	for i := range len(a) / 2 {
+		r[i] = a[2*i] + a[2*i+1]
+		r[i+len(a)/2] = b[2*i] + b[2*i+1]
+	}
+	return r
+}
+
+func subPairsSlice[T number](a, b []T) []T {
+	r := make([]T, len(a))
+	for i := range len(a) / 2 {
+		r[i] = a[2*i] - a[2*i+1]
+		r[i+len(a)/2] = b[2*i] - b[2*i+1]
+	}
+	return r
+}
+
+func addPairsGroupedSlice[T number](a, b []T) []T {
+	group := int(128 / (8 * unsafe.Sizeof(a[0])))
+	r := make([]T, 0, len(a))
+	for i := range len(a) / group {
+		r = append(r, addPairsSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group])...)
+	}
+	return r
+}
+
+func subPairsGroupedSlice[T number](a, b []T) []T {
+	group := int(128 / (8 * unsafe.Sizeof(a[0])))
+	r := make([]T, 0, len(a))
+	for i := range len(a) / group {
+		r = append(r, subPairsSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group])...)
+	}
+	return r
+}
+
+func TestAddSubPairs(t *testing.T) {
+	testInt16x8Binary(t, archsimd.Int16x8.AddPairs, addPairsSlice[int16])
+	testInt16x8Binary(t, archsimd.Int16x8.SubPairs, subPairsSlice[int16])
+	testUint16x8Binary(t, archsimd.Uint16x8.AddPairs, addPairsSlice[uint16])
+	testUint16x8Binary(t, archsimd.Uint16x8.SubPairs, subPairsSlice[uint16])
+	testInt32x4Binary(t, archsimd.Int32x4.AddPairs, addPairsSlice[int32])
+	testInt32x4Binary(t, archsimd.Int32x4.SubPairs, subPairsSlice[int32])
+	testUint32x4Binary(t, archsimd.Uint32x4.AddPairs, addPairsSlice[uint32])
+	testUint32x4Binary(t, archsimd.Uint32x4.SubPairs, subPairsSlice[uint32])
+	testFloat32x4Binary(t, archsimd.Float32x4.AddPairs, addPairsSlice[float32])
+	testFloat32x4Binary(t, archsimd.Float32x4.SubPairs, subPairsSlice[float32])
+	testFloat64x2Binary(t, archsimd.Float64x2.AddPairs, addPairsSlice[float64])
+	testFloat64x2Binary(t, archsimd.Float64x2.SubPairs, subPairsSlice[float64])
+
+	// Grouped versions
+	if archsimd.X86.AVX2() {
+		testInt16x16Binary(t, archsimd.Int16x16.AddPairsGrouped, addPairsGroupedSlice[int16])
+		testInt16x16Binary(t, archsimd.Int16x16.SubPairsGrouped, subPairsGroupedSlice[int16])
+		testUint16x16Binary(t, archsimd.Uint16x16.AddPairsGrouped, addPairsGroupedSlice[uint16])
+		testUint16x16Binary(t, archsimd.Uint16x16.SubPairsGrouped, subPairsGroupedSlice[uint16])
+		testInt32x8Binary(t, archsimd.Int32x8.AddPairsGrouped, addPairsGroupedSlice[int32])
+		testInt32x8Binary(t, archsimd.Int32x8.SubPairsGrouped, subPairsGroupedSlice[int32])
+		testUint32x8Binary(t, archsimd.Uint32x8.AddPairsGrouped, addPairsGroupedSlice[uint32])
+		testUint32x8Binary(t, archsimd.Uint32x8.SubPairsGrouped, subPairsGroupedSlice[uint32])
+		testFloat32x8Binary(t, archsimd.Float32x8.AddPairsGrouped, addPairsGroupedSlice[float32])
+		testFloat32x8Binary(t, archsimd.Float32x8.SubPairsGrouped, subPairsGroupedSlice[float32])
+		testFloat64x4Binary(t, archsimd.Float64x4.AddPairsGrouped, addPairsGroupedSlice[float64])
+		testFloat64x4Binary(t, archsimd.Float64x4.SubPairsGrouped, subPairsGroupedSlice[float64])
+	}
+}
diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go
index 6f904f1cbc..e421f31891 100644
--- a/src/simd/archsimd/ops_amd64.go
+++ b/src/simd/archsimd/ops_amd64.go
@@ -349,90 +349,101 @@ func (x Uint64x8) Add(y Uint64x8) Uint64x8
 
 /* AddPairs */
 
 // AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
 //
 // Asm: VHADDPS, CPU Feature: AVX
 func (x Float32x4) AddPairs(y Float32x4) Float32x4
 
 // AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VHADDPS, CPU Feature: AVX
-func (x Float32x8) AddPairs(y Float32x8) Float32x8
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
// // Asm: VHADDPD, CPU Feature: AVX func (x Float64x2) AddPairs(y Float64x2) Float64x2 // AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // -// Asm: VHADDPD, CPU Feature: AVX -func (x Float64x4) AddPairs(y Float64x4) Float64x4 +// Asm: VPHADDW, CPU Feature: AVX +func (x Int16x8) AddPairs(y Int16x8) Int16x8 // AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // -// Asm: VPHADDW, CPU Feature: AVX -func (x Int16x8) AddPairs(y Int16x8) Int16x8 +// Asm: VPHADDD, CPU Feature: AVX +func (x Int32x4) AddPairs(y Int32x4) Int32x4 // AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // -// Asm: VPHADDW, CPU Feature: AVX2 -func (x Int16x16) AddPairs(y Int16x16) Int16x16 +// Asm: VPHADDW, CPU Feature: AVX +func (x Uint16x8) AddPairs(y Uint16x8) Uint16x8 // AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // // Asm: VPHADDD, CPU Feature: AVX -func (x Int32x4) AddPairs(y Int32x4) Int32x4 +func (x Uint32x4) AddPairs(y Uint32x4) Uint32x4 -// AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +/* AddPairsGrouped */ + +// AddPairsGrouped horizontally adds adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // -// Asm: VPHADDD, CPU Feature: AVX2 -func (x Int32x8) AddPairs(y Int32x8) Int32x8 +// Asm: VHADDPS, CPU Feature: AVX +func (x Float32x8) AddPairsGrouped(y Float32x8) Float32x8 -// AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// AddPairsGrouped horizontally adds adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1]. // -// Asm: VPHADDW, CPU Feature: AVX -func (x Uint16x8) AddPairs(y Uint16x8) Uint16x8 +// Asm: VHADDPD, CPU Feature: AVX +func (x Float64x4) AddPairsGrouped(y Float64x4) Float64x4 -// AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// AddPairsGrouped horizontally adds adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. 
// // Asm: VPHADDW, CPU Feature: AVX2 -func (x Uint16x16) AddPairs(y Uint16x16) Uint16x16 +func (x Int16x16) AddPairsGrouped(y Int16x16) Int16x16 -// AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// AddPairsGrouped horizontally adds adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // -// Asm: VPHADDD, CPU Feature: AVX -func (x Uint32x4) AddPairs(y Uint32x4) Uint32x4 +// Asm: VPHADDD, CPU Feature: AVX2 +func (x Int32x8) AddPairsGrouped(y Int32x8) Int32x8 -// AddPairs horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// AddPairsGrouped horizontally adds adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. +// +// Asm: VPHADDW, CPU Feature: AVX2 +func (x Uint16x16) AddPairsGrouped(y Uint16x16) Uint16x16 + +// AddPairsGrouped horizontally adds adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // // Asm: VPHADDD, CPU Feature: AVX2 -func (x Uint32x8) AddPairs(y Uint32x8) Uint32x8 +func (x Uint32x8) AddPairsGrouped(y Uint32x8) Uint32x8 /* AddPairsSaturated */ // AddPairsSaturated horizontally adds adjacent pairs of elements with saturation. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // // Asm: VPHADDSW, CPU Feature: AVX func (x Int16x8) AddPairsSaturated(y Int16x8) Int16x8 -// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +/* AddPairsSaturatedGrouped */ + +// AddPairsSaturatedGrouped horizontally adds adjacent pairs of elements with saturation. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...]. // // Asm: VPHADDSW, CPU Feature: AVX2 -func (x Int16x16) AddPairsSaturated(y Int16x16) Int16x16 +func (x Int16x16) AddPairsSaturatedGrouped(y Int16x16) Int16x16 /* AddSaturated */ @@ -7084,90 +7095,101 @@ func (x Uint64x8) Sub(y Uint64x8) Uint64x8 /* SubPairs */ // SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // // Asm: VHSUBPS, CPU Feature: AVX func (x Float32x4) SubPairs(y Float32x4) Float32x4 // SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VHSUBPS, CPU Feature: AVX -func (x Float32x8) SubPairs(y Float32x8) Float32x8 - -// SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] 
and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1]. // // Asm: VHSUBPD, CPU Feature: AVX func (x Float64x2) SubPairs(y Float64x2) Float64x2 // SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // -// Asm: VHSUBPD, CPU Feature: AVX -func (x Float64x4) SubPairs(y Float64x4) Float64x4 +// Asm: VPHSUBW, CPU Feature: AVX +func (x Int16x8) SubPairs(y Int16x8) Int16x8 // SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // -// Asm: VPHSUBW, CPU Feature: AVX -func (x Int16x8) SubPairs(y Int16x8) Int16x8 +// Asm: VPHSUBD, CPU Feature: AVX +func (x Int32x4) SubPairs(y Int32x4) Int32x4 // SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // -// Asm: VPHSUBW, CPU Feature: AVX2 -func (x Int16x16) SubPairs(y Int16x16) Int16x16 +// Asm: VPHSUBW, CPU Feature: AVX +func (x Uint16x8) SubPairs(y Uint16x8) Uint16x8 // SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // // Asm: VPHSUBD, CPU Feature: AVX -func (x Int32x4) SubPairs(y Int32x4) Int32x4 +func (x Uint32x4) SubPairs(y Uint32x4) Uint32x4 -// SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +/* SubPairsGrouped */ + +// SubPairsGrouped horizontally subtracts adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // -// Asm: VPHSUBD, CPU Feature: AVX2 -func (x Int32x8) SubPairs(y Int32x8) Int32x8 +// Asm: VHSUBPS, CPU Feature: AVX +func (x Float32x8) SubPairsGrouped(y Float32x8) Float32x8 -// SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// SubPairsGrouped horizontally subtracts adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1]. // -// Asm: VPHSUBW, CPU Feature: AVX -func (x Uint16x8) SubPairs(y Uint16x8) Uint16x8 +// Asm: VHSUBPD, CPU Feature: AVX +func (x Float64x4) SubPairsGrouped(y Float64x4) Float64x4 -// SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// SubPairsGrouped horizontally subtracts adjacent pairs of elements. 
+// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // // Asm: VPHSUBW, CPU Feature: AVX2 -func (x Uint16x16) SubPairs(y Uint16x16) Uint16x16 +func (x Int16x16) SubPairsGrouped(y Int16x16) Int16x16 -// SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// SubPairsGrouped horizontally subtracts adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // -// Asm: VPHSUBD, CPU Feature: AVX -func (x Uint32x4) SubPairs(y Uint32x4) Uint32x4 +// Asm: VPHSUBD, CPU Feature: AVX2 +func (x Int32x8) SubPairsGrouped(y Int32x8) Int32x8 -// SubPairs horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// SubPairsGrouped horizontally subtracts adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. +// +// Asm: VPHSUBW, CPU Feature: AVX2 +func (x Uint16x16) SubPairsGrouped(y Uint16x16) Uint16x16 + +// SubPairsGrouped horizontally subtracts adjacent pairs of elements. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // // Asm: VPHSUBD, CPU Feature: AVX2 -func (x Uint32x8) SubPairs(y Uint32x8) Uint32x8 +func (x Uint32x8) SubPairsGrouped(y Uint32x8) Uint32x8 /* SubPairsSaturated */ // SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // // Asm: VPHSUBSW, CPU Feature: AVX func (x Int16x8) SubPairsSaturated(y Int16x8) Int16x8 -// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +/* SubPairsSaturatedGrouped */ + +// SubPairsSaturatedGrouped horizontally subtracts adjacent pairs of elements with saturation. +// With each 128-bit as a group: +// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...]. // // Asm: VPHSUBSW, CPU Feature: AVX2 -func (x Int16x16) SubPairsSaturated(y Int16x16) Int16x16 +func (x Int16x16) SubPairsSaturatedGrouped(y Int16x16) Int16x16 /* SubSaturated */
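(The diff continues past this point; the remainder is omitted here.)

As a standalone illustration of the semantics this CL documents, and not part of the patch itself, the scalar sketch below models the corrected element order: the low half of a pairwise result comes from x's pair sums and the high half from y's, and the 256-bit Grouped forms apply the 128-bit operation independently to each 128-bit group. The helper names addPairs and addPairsGrouped are hypothetical; the real API is the archsimd methods declared in ops_amd64.go.

package main

import "fmt"

// addPairs models the corrected AddPairs documentation:
// [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...] - x's pair sums first, then y's.
// (Hypothetical scalar model, not the archsimd API.)
func addPairs(x, y []int16) []int16 {
	r := make([]int16, len(x))
	for i := 0; i < len(x)/2; i++ {
		r[i] = x[2*i] + x[2*i+1]
		r[i+len(x)/2] = y[2*i] + y[2*i+1]
	}
	return r
}

// addPairsGrouped models AddPairsGrouped: with each 128 bits as a group,
// addPairs runs independently on corresponding groups of x and y, which
// matches what the 256-bit horizontal-add instructions do per 128-bit lane.
func addPairsGrouped(x, y []int16, lanesPerGroup int) []int16 {
	r := make([]int16, 0, len(x))
	for i := 0; i < len(x); i += lanesPerGroup {
		r = append(r, addPairs(x[i:i+lanesPerGroup], y[i:i+lanesPerGroup])...)
	}
	return r
}

func main() {
	x := []int16{0, 1, 2, 3, 4, 5, 6, 7}
	y := []int16{10, 11, 12, 13, 14, 15, 16, 17}
	// 128-bit order: prints [1 5 9 13 21 25 29 33]
	fmt.Println(addPairs(x, y))

	x2 := []int16{0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23, 24, 25, 26, 27}
	y2 := []int16{10, 11, 12, 13, 14, 15, 16, 17, 30, 31, 32, 33, 34, 35, 36, 37}
	// Grouped: each 8-lane (128-bit) group behaves like the 128-bit op:
	// prints [1 5 9 13 21 25 29 33 41 45 49 53 61 65 69 73]
	fmt.Println(addPairsGrouped(x2, y2, 8))
}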