From c61743e4f0dde8870df5ac157f88353362d76b55 Mon Sep 17 00:00:00 2001
From: Junyang Shao
Date: Tue, 15 Jul 2025 05:13:55 +0000
Subject: [PATCH] [dev.simd] cmd/compile, simd: reorder PairDotProdAccumulate

This CL reorders the parameters of the PairDotProdAccumulate family
to compute dotprod(x, y) + z instead of the old dotprod(y, z) + x.
This CL also updates the documentation of some other ML ops.
This CL adds a test checking that the new behavior is correct.

This CL is partially generated by CL 688115.

Change-Id: I76a6ee55a2ad8e3aff388d7e4fa5218ec0e4800d
Reviewed-on: https://go-review.googlesource.com/c/go/+/688095
Reviewed-by: David Chase
LUCI-TryBot-Result: Go LUCI
---
 .../compile/internal/ssa/_gen/simdAMD64.rules |  12 -
 .../internal/ssa/_gen/simdgenericOps.go       |  68 ++-
 src/cmd/compile/internal/ssa/opGen.go         | 246 ++++------
 src/cmd/compile/internal/ssa/rewriteAMD64.go  | 150 ------
 src/cmd/compile/internal/ssagen/intrinsics.go |  12 +
 .../compile/internal/ssagen/simdintrinsics.go |  60 +--
 src/simd/ops_amd64.go                         | 228 ++++-----
 src/simd/simd_test.go                         |  19 +
 src/simd/simd_wrapped_test.go                 | 449 +-----------------
 9 files changed, 262 insertions(+), 982 deletions(-)

diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 8874417430..e5f17bdb1b 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -1350,15 +1350,9 @@
 (SaturatedUnsignedSignedQuadDotProdAccumulateInt32x4 ...) => (VPDPBUSDS128 ...)
 (SaturatedUnsignedSignedQuadDotProdAccumulateInt32x8 ...) => (VPDPBUSDS256 ...)
 (SaturatedUnsignedSignedQuadDotProdAccumulateInt32x16 ...) => (VPDPBUSDS512 ...)
-(SaturatedUnsignedSignedQuadDotProdAccumulateUint32x4 ...) => (VPDPBUSDS128 ...)
-(SaturatedUnsignedSignedQuadDotProdAccumulateUint32x8 ...) => (VPDPBUSDS256 ...)
-(SaturatedUnsignedSignedQuadDotProdAccumulateUint32x16 ...) => (VPDPBUSDS512 ...)
 (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4 x y z mask) => (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM mask))
 (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8 x y z mask) => (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM mask))
 (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16 x y z mask) => (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM mask))
-(SaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x4 x y z mask) => (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM mask))
-(SaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x8 x y z mask) => (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM mask))
-(SaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x16 x y z mask) => (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM mask))
 (Set128Float32x8 ...) => (VINSERTF128256 ...)
 (Set128Float64x4 ...) => (VINSERTF128256 ...)
 (Set128Int8x32 ...) => (VINSERTI128256 ...)
@@ -1762,15 +1756,9 @@
 (UnsignedSignedQuadDotProdAccumulateInt32x4 ...) => (VPDPBUSD128 ...)
 (UnsignedSignedQuadDotProdAccumulateInt32x8 ...) => (VPDPBUSD256 ...)
 (UnsignedSignedQuadDotProdAccumulateInt32x16 ...) => (VPDPBUSD512 ...)
-(UnsignedSignedQuadDotProdAccumulateUint32x4 ...) => (VPDPBUSD128 ...)
-(UnsignedSignedQuadDotProdAccumulateUint32x8 ...) => (VPDPBUSD256 ...)
-(UnsignedSignedQuadDotProdAccumulateUint32x16 ...) => (VPDPBUSD512 ...)
(UnsignedSignedQuadDotProdAccumulateMaskedInt32x4 x y z mask) => (VPDPBUSDMasked128 x y z (VPMOVVec32x4ToM mask)) (UnsignedSignedQuadDotProdAccumulateMaskedInt32x8 x y z mask) => (VPDPBUSDMasked256 x y z (VPMOVVec32x8ToM mask)) (UnsignedSignedQuadDotProdAccumulateMaskedInt32x16 x y z mask) => (VPDPBUSDMasked512 x y z (VPMOVVec32x16ToM mask)) -(UnsignedSignedQuadDotProdAccumulateMaskedUint32x4 x y z mask) => (VPDPBUSDMasked128 x y z (VPMOVVec32x4ToM mask)) -(UnsignedSignedQuadDotProdAccumulateMaskedUint32x8 x y z mask) => (VPDPBUSDMasked256 x y z (VPMOVVec32x8ToM mask)) -(UnsignedSignedQuadDotProdAccumulateMaskedUint32x16 x y z mask) => (VPDPBUSDMasked512 x y z (VPMOVVec32x16ToM mask)) (XorInt8x16 ...) => (VPXOR128 ...) (XorInt8x32 ...) => (VPXOR256 ...) (XorInt16x8 ...) => (VPXOR128 ...) diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 00e4baf141..c8fe1e9eee 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -914,8 +914,8 @@ func simdGenericOps() []opData { {name: "Permute2Int16x16", argLength: 3, commutative: false}, {name: "Permute2MaskedInt16x16", argLength: 4, commutative: false}, {name: "Permute2MaskedUint16x16", argLength: 4, commutative: false}, - {name: "PermuteMaskedInt16x16", argLength: 3, commutative: false}, {name: "PermuteMaskedUint16x16", argLength: 3, commutative: false}, + {name: "PermuteMaskedInt16x16", argLength: 3, commutative: false}, {name: "PopCountUint16x16", argLength: 1, commutative: false}, {name: "PopCountMaskedUint16x16", argLength: 2, commutative: false}, {name: "SaturatedAddUint16x16", argLength: 2, commutative: true}, @@ -960,12 +960,12 @@ func simdGenericOps() []opData { {name: "MulHighMaskedUint16x32", argLength: 3, commutative: true}, {name: "NotEqualUint16x32", argLength: 2, commutative: true}, {name: "NotEqualMaskedUint16x32", argLength: 3, commutative: true}, - {name: "PermuteInt16x32", argLength: 2, commutative: false}, {name: "PermuteUint16x32", argLength: 2, commutative: false}, - {name: "Permute2Int16x32", argLength: 3, commutative: false}, + {name: "PermuteInt16x32", argLength: 2, commutative: false}, {name: "Permute2Uint16x32", argLength: 3, commutative: false}, - {name: "Permute2MaskedInt16x32", argLength: 4, commutative: false}, + {name: "Permute2Int16x32", argLength: 3, commutative: false}, {name: "Permute2MaskedUint16x32", argLength: 4, commutative: false}, + {name: "Permute2MaskedInt16x32", argLength: 4, commutative: false}, {name: "PermuteMaskedUint16x32", argLength: 3, commutative: false}, {name: "PermuteMaskedInt16x32", argLength: 3, commutative: false}, {name: "PopCountUint16x32", argLength: 1, commutative: false}, @@ -1016,14 +1016,14 @@ func simdGenericOps() []opData { {name: "OrUint16x8", argLength: 2, commutative: true}, {name: "PairwiseAddUint16x8", argLength: 2, commutative: false}, {name: "PairwiseSubUint16x8", argLength: 2, commutative: false}, - {name: "PermuteUint16x8", argLength: 2, commutative: false}, {name: "PermuteInt16x8", argLength: 2, commutative: false}, + {name: "PermuteUint16x8", argLength: 2, commutative: false}, {name: "Permute2Int16x8", argLength: 3, commutative: false}, {name: "Permute2Uint16x8", argLength: 3, commutative: false}, - {name: "Permute2MaskedUint16x8", argLength: 4, commutative: false}, {name: "Permute2MaskedInt16x8", argLength: 4, commutative: false}, - {name: "PermuteMaskedInt16x8", argLength: 3, commutative: false}, + {name: "Permute2MaskedUint16x8", 
argLength: 4, commutative: false}, {name: "PermuteMaskedUint16x8", argLength: 3, commutative: false}, + {name: "PermuteMaskedInt16x8", argLength: 3, commutative: false}, {name: "PopCountUint16x8", argLength: 1, commutative: false}, {name: "PopCountMaskedUint16x8", argLength: 2, commutative: false}, {name: "SaturatedAddUint16x8", argLength: 2, commutative: true}, @@ -1070,26 +1070,24 @@ func simdGenericOps() []opData { {name: "NotEqualMaskedUint32x16", argLength: 3, commutative: true}, {name: "OrUint32x16", argLength: 2, commutative: true}, {name: "OrMaskedUint32x16", argLength: 3, commutative: true}, - {name: "PermuteInt32x16", argLength: 2, commutative: false}, {name: "PermuteFloat32x16", argLength: 2, commutative: false}, + {name: "PermuteInt32x16", argLength: 2, commutative: false}, {name: "PermuteUint32x16", argLength: 2, commutative: false}, {name: "Permute2Uint32x16", argLength: 3, commutative: false}, {name: "Permute2Float32x16", argLength: 3, commutative: false}, {name: "Permute2Int32x16", argLength: 3, commutative: false}, - {name: "Permute2MaskedUint32x16", argLength: 4, commutative: false}, {name: "Permute2MaskedInt32x16", argLength: 4, commutative: false}, {name: "Permute2MaskedFloat32x16", argLength: 4, commutative: false}, + {name: "Permute2MaskedUint32x16", argLength: 4, commutative: false}, + {name: "PermuteMaskedInt32x16", argLength: 3, commutative: false}, {name: "PermuteMaskedFloat32x16", argLength: 3, commutative: false}, {name: "PermuteMaskedUint32x16", argLength: 3, commutative: false}, - {name: "PermuteMaskedInt32x16", argLength: 3, commutative: false}, {name: "PopCountUint32x16", argLength: 1, commutative: false}, {name: "PopCountMaskedUint32x16", argLength: 2, commutative: false}, {name: "RotateLeftUint32x16", argLength: 2, commutative: false}, {name: "RotateLeftMaskedUint32x16", argLength: 3, commutative: false}, {name: "RotateRightUint32x16", argLength: 2, commutative: false}, {name: "RotateRightMaskedUint32x16", argLength: 3, commutative: false}, - {name: "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x16", argLength: 3, commutative: false}, - {name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x16", argLength: 4, commutative: false}, {name: "ShiftAllLeftUint32x16", argLength: 2, commutative: false}, {name: "ShiftAllLeftMaskedUint32x16", argLength: 3, commutative: false}, {name: "ShiftAllRightUint32x16", argLength: 2, commutative: false}, @@ -1104,8 +1102,6 @@ func simdGenericOps() []opData { {name: "ShiftRightMaskedUint32x16", argLength: 3, commutative: false}, {name: "SubUint32x16", argLength: 2, commutative: false}, {name: "SubMaskedUint32x16", argLength: 3, commutative: false}, - {name: "UnsignedSignedQuadDotProdAccumulateUint32x16", argLength: 3, commutative: false}, - {name: "UnsignedSignedQuadDotProdAccumulateMaskedUint32x16", argLength: 4, commutative: false}, {name: "XorUint32x16", argLength: 2, commutative: true}, {name: "XorMaskedUint32x16", argLength: 3, commutative: true}, {name: "AddUint32x4", argLength: 2, commutative: true}, @@ -1136,20 +1132,18 @@ func simdGenericOps() []opData { {name: "OrMaskedUint32x4", argLength: 3, commutative: true}, {name: "PairwiseAddUint32x4", argLength: 2, commutative: false}, {name: "PairwiseSubUint32x4", argLength: 2, commutative: false}, + {name: "Permute2Float32x4", argLength: 3, commutative: false}, {name: "Permute2Uint32x4", argLength: 3, commutative: false}, {name: "Permute2Int32x4", argLength: 3, commutative: false}, - {name: "Permute2Float32x4", argLength: 3, commutative: false}, - {name: 
"Permute2MaskedFloat32x4", argLength: 4, commutative: false}, {name: "Permute2MaskedInt32x4", argLength: 4, commutative: false}, {name: "Permute2MaskedUint32x4", argLength: 4, commutative: false}, + {name: "Permute2MaskedFloat32x4", argLength: 4, commutative: false}, {name: "PopCountUint32x4", argLength: 1, commutative: false}, {name: "PopCountMaskedUint32x4", argLength: 2, commutative: false}, {name: "RotateLeftUint32x4", argLength: 2, commutative: false}, {name: "RotateLeftMaskedUint32x4", argLength: 3, commutative: false}, {name: "RotateRightUint32x4", argLength: 2, commutative: false}, {name: "RotateRightMaskedUint32x4", argLength: 3, commutative: false}, - {name: "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x4", argLength: 3, commutative: false}, - {name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x4", argLength: 4, commutative: false}, {name: "ShiftAllLeftUint32x4", argLength: 2, commutative: false}, {name: "ShiftAllLeftMaskedUint32x4", argLength: 3, commutative: false}, {name: "ShiftAllRightUint32x4", argLength: 2, commutative: false}, @@ -1164,8 +1158,6 @@ func simdGenericOps() []opData { {name: "ShiftRightMaskedUint32x4", argLength: 3, commutative: false}, {name: "SubUint32x4", argLength: 2, commutative: false}, {name: "SubMaskedUint32x4", argLength: 3, commutative: false}, - {name: "UnsignedSignedQuadDotProdAccumulateUint32x4", argLength: 3, commutative: false}, - {name: "UnsignedSignedQuadDotProdAccumulateMaskedUint32x4", argLength: 4, commutative: false}, {name: "XorUint32x4", argLength: 2, commutative: true}, {name: "XorMaskedUint32x4", argLength: 3, commutative: true}, {name: "AddUint32x8", argLength: 2, commutative: true}, @@ -1197,14 +1189,14 @@ func simdGenericOps() []opData { {name: "PairwiseAddUint32x8", argLength: 2, commutative: false}, {name: "PairwiseSubUint32x8", argLength: 2, commutative: false}, {name: "PermuteUint32x8", argLength: 2, commutative: false}, - {name: "PermuteInt32x8", argLength: 2, commutative: false}, {name: "PermuteFloat32x8", argLength: 2, commutative: false}, - {name: "Permute2Uint32x8", argLength: 3, commutative: false}, - {name: "Permute2Float32x8", argLength: 3, commutative: false}, + {name: "PermuteInt32x8", argLength: 2, commutative: false}, {name: "Permute2Int32x8", argLength: 3, commutative: false}, + {name: "Permute2Float32x8", argLength: 3, commutative: false}, + {name: "Permute2Uint32x8", argLength: 3, commutative: false}, {name: "Permute2MaskedFloat32x8", argLength: 4, commutative: false}, - {name: "Permute2MaskedInt32x8", argLength: 4, commutative: false}, {name: "Permute2MaskedUint32x8", argLength: 4, commutative: false}, + {name: "Permute2MaskedInt32x8", argLength: 4, commutative: false}, {name: "PermuteMaskedInt32x8", argLength: 3, commutative: false}, {name: "PermuteMaskedUint32x8", argLength: 3, commutative: false}, {name: "PermuteMaskedFloat32x8", argLength: 3, commutative: false}, @@ -1214,8 +1206,6 @@ func simdGenericOps() []opData { {name: "RotateLeftMaskedUint32x8", argLength: 3, commutative: false}, {name: "RotateRightUint32x8", argLength: 2, commutative: false}, {name: "RotateRightMaskedUint32x8", argLength: 3, commutative: false}, - {name: "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x8", argLength: 3, commutative: false}, - {name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x8", argLength: 4, commutative: false}, {name: "ShiftAllLeftUint32x8", argLength: 2, commutative: false}, {name: "ShiftAllLeftMaskedUint32x8", argLength: 3, commutative: false}, {name: "ShiftAllRightUint32x8", 
argLength: 2, commutative: false}, @@ -1230,8 +1220,6 @@ func simdGenericOps() []opData { {name: "ShiftRightMaskedUint32x8", argLength: 3, commutative: false}, {name: "SubUint32x8", argLength: 2, commutative: false}, {name: "SubMaskedUint32x8", argLength: 3, commutative: false}, - {name: "UnsignedSignedQuadDotProdAccumulateUint32x8", argLength: 3, commutative: false}, - {name: "UnsignedSignedQuadDotProdAccumulateMaskedUint32x8", argLength: 4, commutative: false}, {name: "XorUint32x8", argLength: 2, commutative: true}, {name: "XorMaskedUint32x8", argLength: 3, commutative: true}, {name: "AddUint64x2", argLength: 2, commutative: true}, @@ -1265,8 +1253,8 @@ func simdGenericOps() []opData { {name: "Permute2Uint64x2", argLength: 3, commutative: false}, {name: "Permute2Int64x2", argLength: 3, commutative: false}, {name: "Permute2MaskedInt64x2", argLength: 4, commutative: false}, - {name: "Permute2MaskedUint64x2", argLength: 4, commutative: false}, {name: "Permute2MaskedFloat64x2", argLength: 4, commutative: false}, + {name: "Permute2MaskedUint64x2", argLength: 4, commutative: false}, {name: "PopCountUint64x2", argLength: 1, commutative: false}, {name: "PopCountMaskedUint64x2", argLength: 2, commutative: false}, {name: "RotateLeftUint64x2", argLength: 2, commutative: false}, @@ -1316,18 +1304,18 @@ func simdGenericOps() []opData { {name: "NotEqualMaskedUint64x4", argLength: 3, commutative: true}, {name: "OrUint64x4", argLength: 2, commutative: true}, {name: "OrMaskedUint64x4", argLength: 3, commutative: true}, - {name: "PermuteFloat64x4", argLength: 2, commutative: false}, {name: "PermuteUint64x4", argLength: 2, commutative: false}, {name: "PermuteInt64x4", argLength: 2, commutative: false}, + {name: "PermuteFloat64x4", argLength: 2, commutative: false}, + {name: "Permute2Float64x4", argLength: 3, commutative: false}, {name: "Permute2Int64x4", argLength: 3, commutative: false}, {name: "Permute2Uint64x4", argLength: 3, commutative: false}, - {name: "Permute2Float64x4", argLength: 3, commutative: false}, {name: "Permute2MaskedFloat64x4", argLength: 4, commutative: false}, {name: "Permute2MaskedUint64x4", argLength: 4, commutative: false}, {name: "Permute2MaskedInt64x4", argLength: 4, commutative: false}, {name: "PermuteMaskedFloat64x4", argLength: 3, commutative: false}, - {name: "PermuteMaskedUint64x4", argLength: 3, commutative: false}, {name: "PermuteMaskedInt64x4", argLength: 3, commutative: false}, + {name: "PermuteMaskedUint64x4", argLength: 3, commutative: false}, {name: "PopCountUint64x4", argLength: 1, commutative: false}, {name: "PopCountMaskedUint64x4", argLength: 2, commutative: false}, {name: "RotateLeftUint64x4", argLength: 2, commutative: false}, @@ -1377,18 +1365,18 @@ func simdGenericOps() []opData { {name: "NotEqualMaskedUint64x8", argLength: 3, commutative: true}, {name: "OrUint64x8", argLength: 2, commutative: true}, {name: "OrMaskedUint64x8", argLength: 3, commutative: true}, + {name: "PermuteFloat64x8", argLength: 2, commutative: false}, {name: "PermuteInt64x8", argLength: 2, commutative: false}, {name: "PermuteUint64x8", argLength: 2, commutative: false}, - {name: "PermuteFloat64x8", argLength: 2, commutative: false}, - {name: "Permute2Uint64x8", argLength: 3, commutative: false}, - {name: "Permute2Float64x8", argLength: 3, commutative: false}, {name: "Permute2Int64x8", argLength: 3, commutative: false}, + {name: "Permute2Float64x8", argLength: 3, commutative: false}, + {name: "Permute2Uint64x8", argLength: 3, commutative: false}, {name: "Permute2MaskedUint64x8", argLength: 
4, commutative: false}, - {name: "Permute2MaskedFloat64x8", argLength: 4, commutative: false}, {name: "Permute2MaskedInt64x8", argLength: 4, commutative: false}, + {name: "Permute2MaskedFloat64x8", argLength: 4, commutative: false}, {name: "PermuteMaskedUint64x8", argLength: 3, commutative: false}, - {name: "PermuteMaskedInt64x8", argLength: 3, commutative: false}, {name: "PermuteMaskedFloat64x8", argLength: 3, commutative: false}, + {name: "PermuteMaskedInt64x8", argLength: 3, commutative: false}, {name: "PopCountUint64x8", argLength: 1, commutative: false}, {name: "PopCountMaskedUint64x8", argLength: 2, commutative: false}, {name: "RotateLeftUint64x8", argLength: 2, commutative: false}, @@ -1439,8 +1427,8 @@ func simdGenericOps() []opData { {name: "OrUint8x16", argLength: 2, commutative: true}, {name: "PermuteUint8x16", argLength: 2, commutative: false}, {name: "PermuteInt8x16", argLength: 2, commutative: false}, - {name: "Permute2Int8x16", argLength: 3, commutative: false}, {name: "Permute2Uint8x16", argLength: 3, commutative: false}, + {name: "Permute2Int8x16", argLength: 3, commutative: false}, {name: "Permute2MaskedInt8x16", argLength: 4, commutative: false}, {name: "Permute2MaskedUint8x16", argLength: 4, commutative: false}, {name: "PermuteMaskedUint8x16", argLength: 3, commutative: false}, @@ -1486,10 +1474,10 @@ func simdGenericOps() []opData { {name: "PermuteInt8x32", argLength: 2, commutative: false}, {name: "Permute2Int8x32", argLength: 3, commutative: false}, {name: "Permute2Uint8x32", argLength: 3, commutative: false}, - {name: "Permute2MaskedInt8x32", argLength: 4, commutative: false}, {name: "Permute2MaskedUint8x32", argLength: 4, commutative: false}, - {name: "PermuteMaskedInt8x32", argLength: 3, commutative: false}, + {name: "Permute2MaskedInt8x32", argLength: 4, commutative: false}, {name: "PermuteMaskedUint8x32", argLength: 3, commutative: false}, + {name: "PermuteMaskedInt8x32", argLength: 3, commutative: false}, {name: "PopCountUint8x32", argLength: 1, commutative: false}, {name: "PopCountMaskedUint8x32", argLength: 2, commutative: false}, {name: "SaturatedAddUint8x32", argLength: 2, commutative: true}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 35612493ea..29058f0b19 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -5314,8 +5314,8 @@ const ( OpPermute2Int16x16 OpPermute2MaskedInt16x16 OpPermute2MaskedUint16x16 - OpPermuteMaskedInt16x16 OpPermuteMaskedUint16x16 + OpPermuteMaskedInt16x16 OpPopCountUint16x16 OpPopCountMaskedUint16x16 OpSaturatedAddUint16x16 @@ -5360,12 +5360,12 @@ const ( OpMulHighMaskedUint16x32 OpNotEqualUint16x32 OpNotEqualMaskedUint16x32 - OpPermuteInt16x32 OpPermuteUint16x32 - OpPermute2Int16x32 + OpPermuteInt16x32 OpPermute2Uint16x32 - OpPermute2MaskedInt16x32 + OpPermute2Int16x32 OpPermute2MaskedUint16x32 + OpPermute2MaskedInt16x32 OpPermuteMaskedUint16x32 OpPermuteMaskedInt16x32 OpPopCountUint16x32 @@ -5416,14 +5416,14 @@ const ( OpOrUint16x8 OpPairwiseAddUint16x8 OpPairwiseSubUint16x8 - OpPermuteUint16x8 OpPermuteInt16x8 + OpPermuteUint16x8 OpPermute2Int16x8 OpPermute2Uint16x8 - OpPermute2MaskedUint16x8 OpPermute2MaskedInt16x8 - OpPermuteMaskedInt16x8 + OpPermute2MaskedUint16x8 OpPermuteMaskedUint16x8 + OpPermuteMaskedInt16x8 OpPopCountUint16x8 OpPopCountMaskedUint16x8 OpSaturatedAddUint16x8 @@ -5470,26 +5470,24 @@ const ( OpNotEqualMaskedUint32x16 OpOrUint32x16 OpOrMaskedUint32x16 - OpPermuteInt32x16 OpPermuteFloat32x16 + OpPermuteInt32x16 
OpPermuteUint32x16 OpPermute2Uint32x16 OpPermute2Float32x16 OpPermute2Int32x16 - OpPermute2MaskedUint32x16 OpPermute2MaskedInt32x16 OpPermute2MaskedFloat32x16 + OpPermute2MaskedUint32x16 + OpPermuteMaskedInt32x16 OpPermuteMaskedFloat32x16 OpPermuteMaskedUint32x16 - OpPermuteMaskedInt32x16 OpPopCountUint32x16 OpPopCountMaskedUint32x16 OpRotateLeftUint32x16 OpRotateLeftMaskedUint32x16 OpRotateRightUint32x16 OpRotateRightMaskedUint32x16 - OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16 - OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x16 OpShiftAllLeftUint32x16 OpShiftAllLeftMaskedUint32x16 OpShiftAllRightUint32x16 @@ -5504,8 +5502,6 @@ const ( OpShiftRightMaskedUint32x16 OpSubUint32x16 OpSubMaskedUint32x16 - OpUnsignedSignedQuadDotProdAccumulateUint32x16 - OpUnsignedSignedQuadDotProdAccumulateMaskedUint32x16 OpXorUint32x16 OpXorMaskedUint32x16 OpAddUint32x4 @@ -5536,20 +5532,18 @@ const ( OpOrMaskedUint32x4 OpPairwiseAddUint32x4 OpPairwiseSubUint32x4 + OpPermute2Float32x4 OpPermute2Uint32x4 OpPermute2Int32x4 - OpPermute2Float32x4 - OpPermute2MaskedFloat32x4 OpPermute2MaskedInt32x4 OpPermute2MaskedUint32x4 + OpPermute2MaskedFloat32x4 OpPopCountUint32x4 OpPopCountMaskedUint32x4 OpRotateLeftUint32x4 OpRotateLeftMaskedUint32x4 OpRotateRightUint32x4 OpRotateRightMaskedUint32x4 - OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4 - OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x4 OpShiftAllLeftUint32x4 OpShiftAllLeftMaskedUint32x4 OpShiftAllRightUint32x4 @@ -5564,8 +5558,6 @@ const ( OpShiftRightMaskedUint32x4 OpSubUint32x4 OpSubMaskedUint32x4 - OpUnsignedSignedQuadDotProdAccumulateUint32x4 - OpUnsignedSignedQuadDotProdAccumulateMaskedUint32x4 OpXorUint32x4 OpXorMaskedUint32x4 OpAddUint32x8 @@ -5597,14 +5589,14 @@ const ( OpPairwiseAddUint32x8 OpPairwiseSubUint32x8 OpPermuteUint32x8 - OpPermuteInt32x8 OpPermuteFloat32x8 - OpPermute2Uint32x8 - OpPermute2Float32x8 + OpPermuteInt32x8 OpPermute2Int32x8 + OpPermute2Float32x8 + OpPermute2Uint32x8 OpPermute2MaskedFloat32x8 - OpPermute2MaskedInt32x8 OpPermute2MaskedUint32x8 + OpPermute2MaskedInt32x8 OpPermuteMaskedInt32x8 OpPermuteMaskedUint32x8 OpPermuteMaskedFloat32x8 @@ -5614,8 +5606,6 @@ const ( OpRotateLeftMaskedUint32x8 OpRotateRightUint32x8 OpRotateRightMaskedUint32x8 - OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8 - OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x8 OpShiftAllLeftUint32x8 OpShiftAllLeftMaskedUint32x8 OpShiftAllRightUint32x8 @@ -5630,8 +5620,6 @@ const ( OpShiftRightMaskedUint32x8 OpSubUint32x8 OpSubMaskedUint32x8 - OpUnsignedSignedQuadDotProdAccumulateUint32x8 - OpUnsignedSignedQuadDotProdAccumulateMaskedUint32x8 OpXorUint32x8 OpXorMaskedUint32x8 OpAddUint64x2 @@ -5665,8 +5653,8 @@ const ( OpPermute2Uint64x2 OpPermute2Int64x2 OpPermute2MaskedInt64x2 - OpPermute2MaskedUint64x2 OpPermute2MaskedFloat64x2 + OpPermute2MaskedUint64x2 OpPopCountUint64x2 OpPopCountMaskedUint64x2 OpRotateLeftUint64x2 @@ -5716,18 +5704,18 @@ const ( OpNotEqualMaskedUint64x4 OpOrUint64x4 OpOrMaskedUint64x4 - OpPermuteFloat64x4 OpPermuteUint64x4 OpPermuteInt64x4 + OpPermuteFloat64x4 + OpPermute2Float64x4 OpPermute2Int64x4 OpPermute2Uint64x4 - OpPermute2Float64x4 OpPermute2MaskedFloat64x4 OpPermute2MaskedUint64x4 OpPermute2MaskedInt64x4 OpPermuteMaskedFloat64x4 - OpPermuteMaskedUint64x4 OpPermuteMaskedInt64x4 + OpPermuteMaskedUint64x4 OpPopCountUint64x4 OpPopCountMaskedUint64x4 OpRotateLeftUint64x4 @@ -5777,18 +5765,18 @@ const ( OpNotEqualMaskedUint64x8 OpOrUint64x8 OpOrMaskedUint64x8 + OpPermuteFloat64x8 
OpPermuteInt64x8 OpPermuteUint64x8 - OpPermuteFloat64x8 - OpPermute2Uint64x8 - OpPermute2Float64x8 OpPermute2Int64x8 + OpPermute2Float64x8 + OpPermute2Uint64x8 OpPermute2MaskedUint64x8 - OpPermute2MaskedFloat64x8 OpPermute2MaskedInt64x8 + OpPermute2MaskedFloat64x8 OpPermuteMaskedUint64x8 - OpPermuteMaskedInt64x8 OpPermuteMaskedFloat64x8 + OpPermuteMaskedInt64x8 OpPopCountUint64x8 OpPopCountMaskedUint64x8 OpRotateLeftUint64x8 @@ -5839,8 +5827,8 @@ const ( OpOrUint8x16 OpPermuteUint8x16 OpPermuteInt8x16 - OpPermute2Int8x16 OpPermute2Uint8x16 + OpPermute2Int8x16 OpPermute2MaskedInt8x16 OpPermute2MaskedUint8x16 OpPermuteMaskedUint8x16 @@ -5886,10 +5874,10 @@ const ( OpPermuteInt8x32 OpPermute2Int8x32 OpPermute2Uint8x32 - OpPermute2MaskedInt8x32 OpPermute2MaskedUint8x32 - OpPermuteMaskedInt8x32 + OpPermute2MaskedInt8x32 OpPermuteMaskedUint8x32 + OpPermuteMaskedInt8x32 OpPopCountUint8x32 OpPopCountMaskedUint8x32 OpSaturatedAddUint8x32 @@ -65610,12 +65598,12 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "PermuteMaskedInt16x16", + name: "PermuteMaskedUint16x16", argLen: 3, generic: true, }, { - name: "PermuteMaskedUint16x16", + name: "PermuteMaskedInt16x16", argLen: 3, generic: true, }, @@ -65857,32 +65845,32 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "PermuteInt16x32", + name: "PermuteUint16x32", argLen: 2, generic: true, }, { - name: "PermuteUint16x32", + name: "PermuteInt16x32", argLen: 2, generic: true, }, { - name: "Permute2Int16x32", + name: "Permute2Uint16x32", argLen: 3, generic: true, }, { - name: "Permute2Uint16x32", + name: "Permute2Int16x32", argLen: 3, generic: true, }, { - name: "Permute2MaskedInt16x32", + name: "Permute2MaskedUint16x32", argLen: 4, generic: true, }, { - name: "Permute2MaskedUint16x32", + name: "Permute2MaskedInt16x32", argLen: 4, generic: true, }, @@ -66155,12 +66143,12 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "PermuteUint16x8", + name: "PermuteInt16x8", argLen: 2, generic: true, }, { - name: "PermuteInt16x8", + name: "PermuteUint16x8", argLen: 2, generic: true, }, @@ -66175,22 +66163,22 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2MaskedUint16x8", + name: "Permute2MaskedInt16x8", argLen: 4, generic: true, }, { - name: "Permute2MaskedInt16x8", + name: "Permute2MaskedUint16x8", argLen: 4, generic: true, }, { - name: "PermuteMaskedInt16x8", + name: "PermuteMaskedUint16x8", argLen: 3, generic: true, }, { - name: "PermuteMaskedUint16x8", + name: "PermuteMaskedInt16x8", argLen: 3, generic: true, }, @@ -66442,12 +66430,12 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "PermuteInt32x16", + name: "PermuteFloat32x16", argLen: 2, generic: true, }, { - name: "PermuteFloat32x16", + name: "PermuteInt32x16", argLen: 2, generic: true, }, @@ -66472,32 +66460,32 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2MaskedUint32x16", + name: "Permute2MaskedInt32x16", argLen: 4, generic: true, }, { - name: "Permute2MaskedInt32x16", + name: "Permute2MaskedFloat32x16", argLen: 4, generic: true, }, { - name: "Permute2MaskedFloat32x16", + name: "Permute2MaskedUint32x16", argLen: 4, generic: true, }, { - name: "PermuteMaskedFloat32x16", + name: "PermuteMaskedInt32x16", argLen: 3, generic: true, }, { - name: "PermuteMaskedUint32x16", + name: "PermuteMaskedFloat32x16", argLen: 3, generic: true, }, { - name: "PermuteMaskedInt32x16", + name: "PermuteMaskedUint32x16", argLen: 3, generic: true, }, @@ -66531,16 +66519,6 @@ var opcodeTable = [...]opInfo{ argLen: 3, generic: true, }, - 
{ - name: "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x16", - argLen: 3, - generic: true, - }, - { - name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x16", - argLen: 4, - generic: true, - }, { name: "ShiftAllLeftUint32x16", argLen: 2, @@ -66611,16 +66589,6 @@ var opcodeTable = [...]opInfo{ argLen: 3, generic: true, }, - { - name: "UnsignedSignedQuadDotProdAccumulateUint32x16", - argLen: 3, - generic: true, - }, - { - name: "UnsignedSignedQuadDotProdAccumulateMaskedUint32x16", - argLen: 4, - generic: true, - }, { name: "XorUint32x16", argLen: 2, @@ -66789,32 +66757,32 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2Uint32x4", + name: "Permute2Float32x4", argLen: 3, generic: true, }, { - name: "Permute2Int32x4", + name: "Permute2Uint32x4", argLen: 3, generic: true, }, { - name: "Permute2Float32x4", + name: "Permute2Int32x4", argLen: 3, generic: true, }, { - name: "Permute2MaskedFloat32x4", + name: "Permute2MaskedInt32x4", argLen: 4, generic: true, }, { - name: "Permute2MaskedInt32x4", + name: "Permute2MaskedUint32x4", argLen: 4, generic: true, }, { - name: "Permute2MaskedUint32x4", + name: "Permute2MaskedFloat32x4", argLen: 4, generic: true, }, @@ -66848,16 +66816,6 @@ var opcodeTable = [...]opInfo{ argLen: 3, generic: true, }, - { - name: "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x4", - argLen: 3, - generic: true, - }, - { - name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x4", - argLen: 4, - generic: true, - }, { name: "ShiftAllLeftUint32x4", argLen: 2, @@ -66928,16 +66886,6 @@ var opcodeTable = [...]opInfo{ argLen: 3, generic: true, }, - { - name: "UnsignedSignedQuadDotProdAccumulateUint32x4", - argLen: 3, - generic: true, - }, - { - name: "UnsignedSignedQuadDotProdAccumulateMaskedUint32x4", - argLen: 4, - generic: true, - }, { name: "XorUint32x4", argLen: 2, @@ -67111,17 +67059,17 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "PermuteInt32x8", + name: "PermuteFloat32x8", argLen: 2, generic: true, }, { - name: "PermuteFloat32x8", + name: "PermuteInt32x8", argLen: 2, generic: true, }, { - name: "Permute2Uint32x8", + name: "Permute2Int32x8", argLen: 3, generic: true, }, @@ -67131,7 +67079,7 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2Int32x8", + name: "Permute2Uint32x8", argLen: 3, generic: true, }, @@ -67141,12 +67089,12 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2MaskedInt32x8", + name: "Permute2MaskedUint32x8", argLen: 4, generic: true, }, { - name: "Permute2MaskedUint32x8", + name: "Permute2MaskedInt32x8", argLen: 4, generic: true, }, @@ -67195,16 +67143,6 @@ var opcodeTable = [...]opInfo{ argLen: 3, generic: true, }, - { - name: "SaturatedUnsignedSignedQuadDotProdAccumulateUint32x8", - argLen: 3, - generic: true, - }, - { - name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x8", - argLen: 4, - generic: true, - }, { name: "ShiftAllLeftUint32x8", argLen: 2, @@ -67275,16 +67213,6 @@ var opcodeTable = [...]opInfo{ argLen: 3, generic: true, }, - { - name: "UnsignedSignedQuadDotProdAccumulateUint32x8", - argLen: 3, - generic: true, - }, - { - name: "UnsignedSignedQuadDotProdAccumulateMaskedUint32x8", - argLen: 4, - generic: true, - }, { name: "XorUint32x8", argLen: 2, @@ -67469,12 +67397,12 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2MaskedUint64x2", + name: "Permute2MaskedFloat64x2", argLen: 4, generic: true, }, { - name: "Permute2MaskedFloat64x2", + name: "Permute2MaskedUint64x2", argLen: 4, generic: true, }, @@ 
-67742,32 +67670,32 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "PermuteFloat64x4", + name: "PermuteUint64x4", argLen: 2, generic: true, }, { - name: "PermuteUint64x4", + name: "PermuteInt64x4", argLen: 2, generic: true, }, { - name: "PermuteInt64x4", + name: "PermuteFloat64x4", argLen: 2, generic: true, }, { - name: "Permute2Int64x4", + name: "Permute2Float64x4", argLen: 3, generic: true, }, { - name: "Permute2Uint64x4", + name: "Permute2Int64x4", argLen: 3, generic: true, }, { - name: "Permute2Float64x4", + name: "Permute2Uint64x4", argLen: 3, generic: true, }, @@ -67792,12 +67720,12 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "PermuteMaskedUint64x4", + name: "PermuteMaskedInt64x4", argLen: 3, generic: true, }, { - name: "PermuteMaskedInt64x4", + name: "PermuteMaskedUint64x4", argLen: 3, generic: true, }, @@ -68065,22 +67993,22 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "PermuteInt64x8", + name: "PermuteFloat64x8", argLen: 2, generic: true, }, { - name: "PermuteUint64x8", + name: "PermuteInt64x8", argLen: 2, generic: true, }, { - name: "PermuteFloat64x8", + name: "PermuteUint64x8", argLen: 2, generic: true, }, { - name: "Permute2Uint64x8", + name: "Permute2Int64x8", argLen: 3, generic: true, }, @@ -68090,7 +68018,7 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2Int64x8", + name: "Permute2Uint64x8", argLen: 3, generic: true, }, @@ -68100,12 +68028,12 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2MaskedFloat64x8", + name: "Permute2MaskedInt64x8", argLen: 4, generic: true, }, { - name: "Permute2MaskedInt64x8", + name: "Permute2MaskedFloat64x8", argLen: 4, generic: true, }, @@ -68115,12 +68043,12 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "PermuteMaskedInt64x8", + name: "PermuteMaskedFloat64x8", argLen: 3, generic: true, }, { - name: "PermuteMaskedFloat64x8", + name: "PermuteMaskedInt64x8", argLen: 3, generic: true, }, @@ -68391,12 +68319,12 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2Int8x16", + name: "Permute2Uint8x16", argLen: 3, generic: true, }, { - name: "Permute2Uint8x16", + name: "Permute2Int8x16", argLen: 3, generic: true, }, @@ -68643,22 +68571,22 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2MaskedInt8x32", + name: "Permute2MaskedUint8x32", argLen: 4, generic: true, }, { - name: "Permute2MaskedUint8x32", + name: "Permute2MaskedInt8x32", argLen: 4, generic: true, }, { - name: "PermuteMaskedInt8x32", + name: "PermuteMaskedUint8x32", argLen: 3, generic: true, }, { - name: "PermuteMaskedUint8x32", + name: "PermuteMaskedInt8x32", argLen: 3, generic: true, }, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 53dffe10e4..5c7cafd6f2 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -4297,21 +4297,6 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4(v) case OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8: return rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8(v) - case OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x16: - return rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x16(v) - case OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x4: - return rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x4(v) - 
case OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x8: - return rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x8(v) - case OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16: - v.Op = OpAMD64VPDPBUSDS512 - return true - case OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4: - v.Op = OpAMD64VPDPBUSDS128 - return true - case OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8: - v.Op = OpAMD64VPDPBUSDS256 - return true case OpSelect0: return rewriteValueAMD64_OpSelect0(v) case OpSelect1: @@ -5416,21 +5401,6 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x4(v) case OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x8: return rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x8(v) - case OpUnsignedSignedQuadDotProdAccumulateMaskedUint32x16: - return rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedUint32x16(v) - case OpUnsignedSignedQuadDotProdAccumulateMaskedUint32x4: - return rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedUint32x4(v) - case OpUnsignedSignedQuadDotProdAccumulateMaskedUint32x8: - return rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedUint32x8(v) - case OpUnsignedSignedQuadDotProdAccumulateUint32x16: - v.Op = OpAMD64VPDPBUSD512 - return true - case OpUnsignedSignedQuadDotProdAccumulateUint32x4: - v.Op = OpAMD64VPDPBUSD128 - return true - case OpUnsignedSignedQuadDotProdAccumulateUint32x8: - v.Op = OpAMD64VPDPBUSD256 - return true case OpWB: v.Op = OpAMD64LoweredWB return true @@ -49615,66 +49585,6 @@ func rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32 return true } } -func rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x16(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x16 x y z mask) - // result: (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPBUSDSMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x4(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x4 x y z mask) - // result: (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPBUSDSMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x8(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x8 x y z mask) - // result: (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPBUSDSMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} func rewriteValueAMD64_OpSelect0(v *Value) bool { v_0 := v.Args[0] b := v.Block @@ -53973,66 +53883,6 @@ func 
rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x8(v *Val return true } } -func rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedUint32x16(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (UnsignedSignedQuadDotProdAccumulateMaskedUint32x16 x y z mask) - // result: (VPDPBUSDMasked512 x y z (VPMOVVec32x16ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPBUSDMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedUint32x4(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (UnsignedSignedQuadDotProdAccumulateMaskedUint32x4 x y z mask) - // result: (VPDPBUSDMasked128 x y z (VPMOVVec32x4ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPBUSDMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedUint32x8(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (UnsignedSignedQuadDotProdAccumulateMaskedUint32x8 x y z mask) - // result: (VPDPBUSDMasked256 x y z (VPMOVVec32x8ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPBUSDMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} func rewriteValueAMD64_OpXorMaskedInt32x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index fd7ebb20a3..337f0b86e6 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -1634,6 +1634,12 @@ func opLen3(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa } } +func opLen3_31(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue3(op, t, args[2], args[1], args[0]) + } +} + func opLen3_21(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue3(op, t, args[1], args[0], args[2]) @@ -1658,6 +1664,12 @@ func opLen4_231(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args [] } } +func opLen4_31(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return s.newValue4(op, t, args[2], args[1], args[0], args[3]) + } +} + func plainPanicSimdImm(s *state) { cmp := s.newValue0(ssa.OpConstBool, types.Types[types.TBOOL]) cmp.AuxInt = 0 diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 1472f5ec1a..3d92949908 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -993,12 +993,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Int16x8.PairDotProd", opLen2(ssa.OpPairDotProdInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.PairDotProd", opLen2(ssa.OpPairDotProdInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x32.PairDotProd", opLen2(ssa.OpPairDotProdInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.PairDotProdAccumulate", opLen3(ssa.OpPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.PairDotProdAccumulate", opLen3(ssa.OpPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.PairDotProdAccumulate", opLen3(ssa.OpPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.PairDotProdAccumulateMasked", opLen4(ssa.OpPairDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.PairDotProdAccumulateMasked", opLen4(ssa.OpPairDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.PairDotProdAccumulateMasked", opLen4(ssa.OpPairDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.PairDotProdAccumulate", opLen3_31(ssa.OpPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.PairDotProdAccumulate", opLen3_31(ssa.OpPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.PairDotProdAccumulate", opLen3_31(ssa.OpPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.PairDotProdAccumulateMasked", opLen4_31(ssa.OpPairDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.PairDotProdAccumulateMasked", opLen4_31(ssa.OpPairDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.PairDotProdAccumulateMasked", opLen4_31(ssa.OpPairDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int16x8.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x32.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x32, types.TypeVec512), sys.AMD64) @@ -1318,12 +1318,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint16x8.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint16x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.SaturatedPairDotProdAccumulate", opLen3(ssa.OpSaturatedPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.SaturatedPairDotProdAccumulate", opLen3(ssa.OpSaturatedPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.SaturatedPairDotProdAccumulate", opLen3(ssa.OpSaturatedPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.SaturatedPairDotProdAccumulateMasked", opLen4(ssa.OpSaturatedPairDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.SaturatedPairDotProdAccumulateMasked", opLen4(ssa.OpSaturatedPairDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.SaturatedPairDotProdAccumulateMasked", opLen4(ssa.OpSaturatedPairDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.SaturatedPairDotProdAccumulate", opLen3_31(ssa.OpSaturatedPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.SaturatedPairDotProdAccumulate", opLen3_31(ssa.OpSaturatedPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.SaturatedPairDotProdAccumulate", opLen3_31(ssa.OpSaturatedPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.SaturatedPairDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedPairDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.SaturatedPairDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedPairDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.SaturatedPairDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedPairDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int16x8.SaturatedPairwiseAdd", opLen2(ssa.OpSaturatedPairwiseAddInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.SaturatedPairwiseAdd", opLen2(ssa.OpSaturatedPairwiseAddInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x8.SaturatedPairwiseSub", opLen2(ssa.OpSaturatedPairwiseSubInt16x8, types.TypeVec128), sys.AMD64) @@ -1358,18 +1358,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint8x16.SaturatedUnsignedSignedPairDotProdMasked", opLen3(ssa.OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x32.SaturatedUnsignedSignedPairDotProdMasked", opLen3(ssa.OpSaturatedUnsignedSignedPairDotProdMaskedUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x64.SaturatedUnsignedSignedPairDotProdMasked", opLen3(ssa.OpSaturatedUnsignedSignedPairDotProdMaskedUint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint32x4.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x8.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint32x16.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint32x4.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x8.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint32x16.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64) + 
addF(simdPackage, "Int8x64.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x8.Set128", opLen2Imm8(ssa.OpSet128Float32x8, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Float64x4.Set128", opLen2Imm8(ssa.OpSet128Float64x4, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Int8x32.Set128", opLen2Imm8(ssa.OpSet128Int8x32, types.TypeVec256, 0), sys.AMD64) @@ -1770,18 +1764,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float64x2.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) addF(simdPackage, "Float64x4.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) addF(simdPackage, "Float64x8.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Int32x4.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint32x4.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateUint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x8.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateUint32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint32x16.UnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpUnsignedSignedQuadDotProdAccumulateUint32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.UnsignedSignedQuadDotProdAccumulateMasked", opLen4(ssa.OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.UnsignedSignedQuadDotProdAccumulateMasked", opLen4(ssa.OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.UnsignedSignedQuadDotProdAccumulateMasked", opLen4(ssa.OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint32x4.UnsignedSignedQuadDotProdAccumulateMasked", opLen4(ssa.OpUnsignedSignedQuadDotProdAccumulateMaskedUint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x8.UnsignedSignedQuadDotProdAccumulateMasked", opLen4(ssa.OpUnsignedSignedQuadDotProdAccumulateMaskedUint32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint32x16.UnsignedSignedQuadDotProdAccumulateMasked", opLen4(ssa.OpUnsignedSignedQuadDotProdAccumulateMaskedUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.UnsignedSignedQuadDotProdAccumulateMasked", 
opLen4_31(ssa.OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.UnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.UnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x16.Xor", opLen2(ssa.OpXorInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Xor", opLen2(ssa.OpXorInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x8.Xor", opLen2(ssa.OpXorInt16x8, types.TypeVec128), sys.AMD64) diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index 3b87836962..4624105d79 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -2115,192 +2115,192 @@ func (x Float64x8) FloorWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 /* FusedMultiplyAdd */ -// FusedMultiplyAdd performs `(v1 * v2) + v3`. +// FusedMultiplyAdd performs (x * y) + z. // // Asm: VFMADD213PS, CPU Feature: AVX512F func (x Float32x4) FusedMultiplyAdd(y Float32x4, z Float32x4) Float32x4 -// FusedMultiplyAdd performs `(v1 * v2) + v3`. +// FusedMultiplyAdd performs (x * y) + z. // // Asm: VFMADD213PS, CPU Feature: AVX512F func (x Float32x8) FusedMultiplyAdd(y Float32x8, z Float32x8) Float32x8 -// FusedMultiplyAdd performs `(v1 * v2) + v3`. +// FusedMultiplyAdd performs (x * y) + z. // // Asm: VFMADD213PS, CPU Feature: AVX512F func (x Float32x16) FusedMultiplyAdd(y Float32x16, z Float32x16) Float32x16 -// FusedMultiplyAdd performs `(v1 * v2) + v3`. +// FusedMultiplyAdd performs (x * y) + z. // // Asm: VFMADD213PD, CPU Feature: AVX512F func (x Float64x2) FusedMultiplyAdd(y Float64x2, z Float64x2) Float64x2 -// FusedMultiplyAdd performs `(v1 * v2) + v3`. +// FusedMultiplyAdd performs (x * y) + z. // // Asm: VFMADD213PD, CPU Feature: AVX512F func (x Float64x4) FusedMultiplyAdd(y Float64x4, z Float64x4) Float64x4 -// FusedMultiplyAdd performs `(v1 * v2) + v3`. +// FusedMultiplyAdd performs (x * y) + z. // // Asm: VFMADD213PD, CPU Feature: AVX512F func (x Float64x8) FusedMultiplyAdd(y Float64x8, z Float64x8) Float64x8 /* FusedMultiplyAddMasked */ -// FusedMultiplyAddMasked performs `(v1 * v2) + v3`. +// FusedMultiplyAddMasked performs (x * y) + z. // // Asm: VFMADD213PS, CPU Feature: AVX512F func (x Float32x4) FusedMultiplyAddMasked(y Float32x4, z Float32x4, mask Mask32x4) Float32x4 -// FusedMultiplyAddMasked performs `(v1 * v2) + v3`. +// FusedMultiplyAddMasked performs (x * y) + z. // // Asm: VFMADD213PS, CPU Feature: AVX512F func (x Float32x8) FusedMultiplyAddMasked(y Float32x8, z Float32x8, mask Mask32x8) Float32x8 -// FusedMultiplyAddMasked performs `(v1 * v2) + v3`. +// FusedMultiplyAddMasked performs (x * y) + z. // // Asm: VFMADD213PS, CPU Feature: AVX512F func (x Float32x16) FusedMultiplyAddMasked(y Float32x16, z Float32x16, mask Mask32x16) Float32x16 -// FusedMultiplyAddMasked performs `(v1 * v2) + v3`. +// FusedMultiplyAddMasked performs (x * y) + z. // // Asm: VFMADD213PD, CPU Feature: AVX512F func (x Float64x2) FusedMultiplyAddMasked(y Float64x2, z Float64x2, mask Mask64x2) Float64x2 -// FusedMultiplyAddMasked performs `(v1 * v2) + v3`. +// FusedMultiplyAddMasked performs (x * y) + z. // // Asm: VFMADD213PD, CPU Feature: AVX512F func (x Float64x4) FusedMultiplyAddMasked(y Float64x4, z Float64x4, mask Mask64x4) Float64x4 -// FusedMultiplyAddMasked performs `(v1 * v2) + v3`. 
+// FusedMultiplyAddMasked performs (x * y) + z. // // Asm: VFMADD213PD, CPU Feature: AVX512F func (x Float64x8) FusedMultiplyAddMasked(y Float64x8, z Float64x8, mask Mask64x8) Float64x8 /* FusedMultiplyAddSub */ -// FusedMultiplyAddSub performs `(v1 * v2) - v3` for odd-indexed elements, and `(v1 * v2) + v3` for even-indexed elements. +// FusedMultiplyAddSub performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // // Asm: VFMADDSUB213PS, CPU Feature: AVX512F func (x Float32x4) FusedMultiplyAddSub(y Float32x4, z Float32x4) Float32x4 -// FusedMultiplyAddSub performs `(v1 * v2) - v3` for odd-indexed elements, and `(v1 * v2) + v3` for even-indexed elements. +// FusedMultiplyAddSub performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // // Asm: VFMADDSUB213PS, CPU Feature: AVX512F func (x Float32x8) FusedMultiplyAddSub(y Float32x8, z Float32x8) Float32x8 -// FusedMultiplyAddSub performs `(v1 * v2) - v3` for odd-indexed elements, and `(v1 * v2) + v3` for even-indexed elements. +// FusedMultiplyAddSub performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // // Asm: VFMADDSUB213PS, CPU Feature: AVX512F func (x Float32x16) FusedMultiplyAddSub(y Float32x16, z Float32x16) Float32x16 -// FusedMultiplyAddSub performs `(v1 * v2) - v3` for odd-indexed elements, and `(v1 * v2) + v3` for even-indexed elements. +// FusedMultiplyAddSub performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // // Asm: VFMADDSUB213PD, CPU Feature: AVX512F func (x Float64x2) FusedMultiplyAddSub(y Float64x2, z Float64x2) Float64x2 -// FusedMultiplyAddSub performs `(v1 * v2) - v3` for odd-indexed elements, and `(v1 * v2) + v3` for even-indexed elements. +// FusedMultiplyAddSub performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // // Asm: VFMADDSUB213PD, CPU Feature: AVX512F func (x Float64x4) FusedMultiplyAddSub(y Float64x4, z Float64x4) Float64x4 -// FusedMultiplyAddSub performs `(v1 * v2) - v3` for odd-indexed elements, and `(v1 * v2) + v3` for even-indexed elements. +// FusedMultiplyAddSub performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // // Asm: VFMADDSUB213PD, CPU Feature: AVX512F func (x Float64x8) FusedMultiplyAddSub(y Float64x8, z Float64x8) Float64x8 /* FusedMultiplyAddSubMasked */ -// FusedMultiplyAddSubMasked performs `(v1 * v2) - v3` for odd-indexed elements, and `(v1 * v2) + v3` for even-indexed elements. +// FusedMultiplyAddSubMasked performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // // Asm: VFMADDSUB213PS, CPU Feature: AVX512F func (x Float32x4) FusedMultiplyAddSubMasked(y Float32x4, z Float32x4, mask Mask32x4) Float32x4 -// FusedMultiplyAddSubMasked performs `(v1 * v2) - v3` for odd-indexed elements, and `(v1 * v2) + v3` for even-indexed elements. +// FusedMultiplyAddSubMasked performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // // Asm: VFMADDSUB213PS, CPU Feature: AVX512F func (x Float32x8) FusedMultiplyAddSubMasked(y Float32x8, z Float32x8, mask Mask32x8) Float32x8 -// FusedMultiplyAddSubMasked performs `(v1 * v2) - v3` for odd-indexed elements, and `(v1 * v2) + v3` for even-indexed elements. +// FusedMultiplyAddSubMasked performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. 
// // Asm: VFMADDSUB213PS, CPU Feature: AVX512F func (x Float32x16) FusedMultiplyAddSubMasked(y Float32x16, z Float32x16, mask Mask32x16) Float32x16 -// FusedMultiplyAddSubMasked performs `(v1 * v2) - v3` for odd-indexed elements, and `(v1 * v2) + v3` for even-indexed elements. +// FusedMultiplyAddSubMasked performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // // Asm: VFMADDSUB213PD, CPU Feature: AVX512F func (x Float64x2) FusedMultiplyAddSubMasked(y Float64x2, z Float64x2, mask Mask64x2) Float64x2 -// FusedMultiplyAddSubMasked performs `(v1 * v2) - v3` for odd-indexed elements, and `(v1 * v2) + v3` for even-indexed elements. +// FusedMultiplyAddSubMasked performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // // Asm: VFMADDSUB213PD, CPU Feature: AVX512F func (x Float64x4) FusedMultiplyAddSubMasked(y Float64x4, z Float64x4, mask Mask64x4) Float64x4 -// FusedMultiplyAddSubMasked performs `(v1 * v2) - v3` for odd-indexed elements, and `(v1 * v2) + v3` for even-indexed elements. +// FusedMultiplyAddSubMasked performs (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements. // // Asm: VFMADDSUB213PD, CPU Feature: AVX512F func (x Float64x8) FusedMultiplyAddSubMasked(y Float64x8, z Float64x8, mask Mask64x8) Float64x8 /* FusedMultiplySubAdd */ -// FusedMultiplySubAdd performs `(v1 * v2) + v3` for odd-indexed elements, and `(v1 * v2) - v3` for even-indexed elements. +// FusedMultiplySubAdd performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. // // Asm: VFMSUBADD213PS, CPU Feature: AVX512F func (x Float32x4) FusedMultiplySubAdd(y Float32x4, z Float32x4) Float32x4 -// FusedMultiplySubAdd performs `(v1 * v2) + v3` for odd-indexed elements, and `(v1 * v2) - v3` for even-indexed elements. +// FusedMultiplySubAdd performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. // // Asm: VFMSUBADD213PS, CPU Feature: AVX512F func (x Float32x8) FusedMultiplySubAdd(y Float32x8, z Float32x8) Float32x8 -// FusedMultiplySubAdd performs `(v1 * v2) + v3` for odd-indexed elements, and `(v1 * v2) - v3` for even-indexed elements. +// FusedMultiplySubAdd performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. // // Asm: VFMSUBADD213PS, CPU Feature: AVX512F func (x Float32x16) FusedMultiplySubAdd(y Float32x16, z Float32x16) Float32x16 -// FusedMultiplySubAdd performs `(v1 * v2) + v3` for odd-indexed elements, and `(v1 * v2) - v3` for even-indexed elements. +// FusedMultiplySubAdd performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. // // Asm: VFMSUBADD213PD, CPU Feature: AVX512F func (x Float64x2) FusedMultiplySubAdd(y Float64x2, z Float64x2) Float64x2 -// FusedMultiplySubAdd performs `(v1 * v2) + v3` for odd-indexed elements, and `(v1 * v2) - v3` for even-indexed elements. +// FusedMultiplySubAdd performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. // // Asm: VFMSUBADD213PD, CPU Feature: AVX512F func (x Float64x4) FusedMultiplySubAdd(y Float64x4, z Float64x4) Float64x4 -// FusedMultiplySubAdd performs `(v1 * v2) + v3` for odd-indexed elements, and `(v1 * v2) - v3` for even-indexed elements. +// FusedMultiplySubAdd performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. 
// // Asm: VFMSUBADD213PD, CPU Feature: AVX512F func (x Float64x8) FusedMultiplySubAdd(y Float64x8, z Float64x8) Float64x8 /* FusedMultiplySubAddMasked */ -// FusedMultiplySubAddMasked performs `(v1 * v2) + v3` for odd-indexed elements, and `(v1 * v2) - v3` for even-indexed elements. +// FusedMultiplySubAddMasked performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. // // Asm: VFMSUBADD213PS, CPU Feature: AVX512F func (x Float32x4) FusedMultiplySubAddMasked(y Float32x4, z Float32x4, mask Mask32x4) Float32x4 -// FusedMultiplySubAddMasked performs `(v1 * v2) + v3` for odd-indexed elements, and `(v1 * v2) - v3` for even-indexed elements. +// FusedMultiplySubAddMasked performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. // // Asm: VFMSUBADD213PS, CPU Feature: AVX512F func (x Float32x8) FusedMultiplySubAddMasked(y Float32x8, z Float32x8, mask Mask32x8) Float32x8 -// FusedMultiplySubAddMasked performs `(v1 * v2) + v3` for odd-indexed elements, and `(v1 * v2) - v3` for even-indexed elements. +// FusedMultiplySubAddMasked performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. // // Asm: VFMSUBADD213PS, CPU Feature: AVX512F func (x Float32x16) FusedMultiplySubAddMasked(y Float32x16, z Float32x16, mask Mask32x16) Float32x16 -// FusedMultiplySubAddMasked performs `(v1 * v2) + v3` for odd-indexed elements, and `(v1 * v2) - v3` for even-indexed elements. +// FusedMultiplySubAddMasked performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. // // Asm: VFMSUBADD213PD, CPU Feature: AVX512F func (x Float64x2) FusedMultiplySubAddMasked(y Float64x2, z Float64x2, mask Mask64x2) Float64x2 -// FusedMultiplySubAddMasked performs `(v1 * v2) + v3` for odd-indexed elements, and `(v1 * v2) - v3` for even-indexed elements. +// FusedMultiplySubAddMasked performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. // // Asm: VFMSUBADD213PD, CPU Feature: AVX512F func (x Float64x4) FusedMultiplySubAddMasked(y Float64x4, z Float64x4, mask Mask64x4) Float64x4 -// FusedMultiplySubAddMasked performs `(v1 * v2) + v3` for odd-indexed elements, and `(v1 * v2) - v3` for even-indexed elements. +// FusedMultiplySubAddMasked performs (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements. // // Asm: VFMSUBADD213PD, CPU Feature: AVX512F func (x Float64x8) FusedMultiplySubAddMasked(y Float64x8, z Float64x8, mask Mask64x8) Float64x8 @@ -5373,37 +5373,37 @@ func (x Int16x32) PairDotProd(y Int16x32) Int32x16 /* PairDotProdAccumulate */ -// PairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x. +// PairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z. // // Asm: VPDPWSSD, CPU Feature: AVXVNNI -func (x Int32x4) PairDotProdAccumulate(y Int16x8, z Int16x8) Int32x4 +func (x Int16x8) PairDotProdAccumulate(y Int16x8, z Int32x4) Int32x4 -// PairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x. +// PairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z. // // Asm: VPDPWSSD, CPU Feature: AVXVNNI -func (x Int32x8) PairDotProdAccumulate(y Int16x16, z Int16x16) Int32x8 +func (x Int16x16) PairDotProdAccumulate(y Int16x16, z Int32x8) Int32x8 -// PairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x. 
+// PairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z. // // Asm: VPDPWSSD, CPU Feature: AVX512VNNI -func (x Int32x16) PairDotProdAccumulate(y Int16x32, z Int16x32) Int32x16 +func (x Int16x32) PairDotProdAccumulate(y Int16x32, z Int32x16) Int32x16 /* PairDotProdAccumulateMasked */ -// PairDotProdAccumulateMasked performs dot products on pairs of elements of y and z and accumulates the results to x. +// PairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z. // // Asm: VPDPWSSD, CPU Feature: AVX512VNNI -func (x Int32x4) PairDotProdAccumulateMasked(y Int16x8, z Int16x8, mask Mask32x4) Int32x4 +func (x Int16x8) PairDotProdAccumulateMasked(y Int16x8, z Int32x4, mask Mask32x4) Int32x4 -// PairDotProdAccumulateMasked performs dot products on pairs of elements of y and z and accumulates the results to x. +// PairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z. // // Asm: VPDPWSSD, CPU Feature: AVX512VNNI -func (x Int32x8) PairDotProdAccumulateMasked(y Int16x16, z Int16x16, mask Mask32x8) Int32x8 +func (x Int16x16) PairDotProdAccumulateMasked(y Int16x16, z Int32x8, mask Mask32x8) Int32x8 -// PairDotProdAccumulateMasked performs dot products on pairs of elements of y and z and accumulates the results to x. +// PairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z. // // Asm: VPDPWSSD, CPU Feature: AVX512VNNI -func (x Int32x16) PairDotProdAccumulateMasked(y Int16x32, z Int16x32, mask Mask32x16) Int32x16 +func (x Int16x32) PairDotProdAccumulateMasked(y Int16x32, z Int32x16, mask Mask32x16) Int32x16 /* PairDotProdMasked */ @@ -7469,37 +7469,37 @@ func (x Uint16x32) SaturatedAddMasked(y Uint16x32, mask Mask16x32) Uint16x32 /* SaturatedPairDotProdAccumulate */ -// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x. +// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z. // // Asm: VPDPWSSDS, CPU Feature: AVXVNNI -func (x Int32x4) SaturatedPairDotProdAccumulate(y Int16x8, z Int16x8) Int32x4 +func (x Int16x8) SaturatedPairDotProdAccumulate(y Int16x8, z Int32x4) Int32x4 -// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x. +// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z. // // Asm: VPDPWSSDS, CPU Feature: AVXVNNI -func (x Int32x8) SaturatedPairDotProdAccumulate(y Int16x16, z Int16x16) Int32x8 +func (x Int16x16) SaturatedPairDotProdAccumulate(y Int16x16, z Int32x8) Int32x8 -// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of y and z and accumulates the results to x. +// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z. // // Asm: VPDPWSSDS, CPU Feature: AVX512VNNI -func (x Int32x16) SaturatedPairDotProdAccumulate(y Int16x32, z Int16x32) Int32x16 +func (x Int16x32) SaturatedPairDotProdAccumulate(y Int16x32, z Int32x16) Int32x16 /* SaturatedPairDotProdAccumulateMasked */ -// SaturatedPairDotProdAccumulateMasked performs dot products on pairs of elements of y and z and accumulates the results to x. +// SaturatedPairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z. 
// // Asm: VPDPWSSDS, CPU Feature: AVX512VNNI -func (x Int32x4) SaturatedPairDotProdAccumulateMasked(y Int16x8, z Int16x8, mask Mask32x4) Int32x4 +func (x Int16x8) SaturatedPairDotProdAccumulateMasked(y Int16x8, z Int32x4, mask Mask32x4) Int32x4 -// SaturatedPairDotProdAccumulateMasked performs dot products on pairs of elements of y and z and accumulates the results to x. +// SaturatedPairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z. // // Asm: VPDPWSSDS, CPU Feature: AVX512VNNI -func (x Int32x8) SaturatedPairDotProdAccumulateMasked(y Int16x16, z Int16x16, mask Mask32x8) Int32x8 +func (x Int16x16) SaturatedPairDotProdAccumulateMasked(y Int16x16, z Int32x8, mask Mask32x8) Int32x8 -// SaturatedPairDotProdAccumulateMasked performs dot products on pairs of elements of y and z and accumulates the results to x. +// SaturatedPairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z. // // Asm: VPDPWSSDS, CPU Feature: AVX512VNNI -func (x Int32x16) SaturatedPairDotProdAccumulateMasked(y Int16x32, z Int16x32, mask Mask32x16) Int32x16 +func (x Int16x32) SaturatedPairDotProdAccumulateMasked(y Int16x32, z Int32x16, mask Mask32x16) Int32x16 /* SaturatedPairwiseAdd */ @@ -7695,67 +7695,37 @@ func (x Uint8x64) SaturatedUnsignedSignedPairDotProdMasked(y Int8x64, mask Mask1 /* SaturatedUnsignedSignedQuadDotProdAccumulate */ -// SaturatedUnsignedSignedQuadDotProdAccumulate multiplies performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of x and y and then adds z. // // Asm: VPDPBUSDS, CPU Feature: AVXVNNI -func (x Int32x4) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int8x16) Int32x4 +func (x Int8x16) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int32x4) Int32x4 -// SaturatedUnsignedSignedQuadDotProdAccumulate multiplies performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of x and y and then adds z. // // Asm: VPDPBUSDS, CPU Feature: AVXVNNI -func (x Int32x8) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int8x32) Int32x8 +func (x Int8x32) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int32x8) Int32x8 -// SaturatedUnsignedSignedQuadDotProdAccumulate multiplies performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// SaturatedUnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of x and y and then adds z. // // Asm: VPDPBUSDS, CPU Feature: AVX512VNNI -func (x Int32x16) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int8x64) Int32x16 - -// SaturatedUnsignedSignedQuadDotProdAccumulate multiplies performs dot products on groups of 4 elements of y and z and accumulates the results to x. -// -// Asm: VPDPBUSDS, CPU Feature: AVXVNNI -func (x Uint32x4) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int8x16) Uint32x4 - -// SaturatedUnsignedSignedQuadDotProdAccumulate multiplies performs dot products on groups of 4 elements of y and z and accumulates the results to x. 
-// -// Asm: VPDPBUSDS, CPU Feature: AVXVNNI -func (x Uint32x8) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int8x32) Uint32x8 - -// SaturatedUnsignedSignedQuadDotProdAccumulate multiplies performs dot products on groups of 4 elements of y and z and accumulates the results to x. -// -// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI -func (x Uint32x16) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int8x64) Uint32x16 +func (x Int8x64) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16) Int32x16 /* SaturatedUnsignedSignedQuadDotProdAccumulateMasked */ -// SaturatedUnsignedSignedQuadDotProdAccumulateMasked multiplies performs dot products on groups of 4 elements of y and z and accumulates the results to x. -// -// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI -func (x Int32x4) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x16, z Int8x16, mask Mask32x4) Int32x4 - -// SaturatedUnsignedSignedQuadDotProdAccumulateMasked multiplies performs dot products on groups of 4 elements of y and z and accumulates the results to x. -// -// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI -func (x Int32x8) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x32, z Int8x32, mask Mask32x8) Int32x8 - -// SaturatedUnsignedSignedQuadDotProdAccumulateMasked multiplies performs dot products on groups of 4 elements of y and z and accumulates the results to x. -// -// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI -func (x Int32x16) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x64, z Int8x64, mask Mask32x16) Int32x16 - -// SaturatedUnsignedSignedQuadDotProdAccumulateMasked multiplies performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// SaturatedUnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of x and y and then adds z. // // Asm: VPDPBUSDS, CPU Feature: AVX512VNNI -func (x Uint32x4) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x16, z Int8x16, mask Mask32x4) Uint32x4 +func (x Int8x16) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x16, z Int32x4, mask Mask32x4) Int32x4 -// SaturatedUnsignedSignedQuadDotProdAccumulateMasked multiplies performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// SaturatedUnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of x and y and then adds z. // // Asm: VPDPBUSDS, CPU Feature: AVX512VNNI -func (x Uint32x8) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x32, z Int8x32, mask Mask32x8) Uint32x8 +func (x Int8x32) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x32, z Int32x8, mask Mask32x8) Int32x8 -// SaturatedUnsignedSignedQuadDotProdAccumulateMasked multiplies performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// SaturatedUnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of x and y and then adds z. 
// // Asm: VPDPBUSDS, CPU Feature: AVX512VNNI -func (x Uint32x16) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x64, z Int8x64, mask Mask32x16) Uint32x16 +func (x Int8x64) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x64, z Int32x16, mask Mask32x16) Int32x16 /* Set128 */ @@ -10165,67 +10135,37 @@ func (x Float64x8) TruncWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 /* UnsignedSignedQuadDotProdAccumulate */ -// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of x and y and then adds z. // // Asm: VPDPBUSD, CPU Feature: AVXVNNI -func (x Int32x4) UnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int8x16) Int32x4 +func (x Int8x16) UnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int32x4) Int32x4 -// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of x and y and then adds z. // // Asm: VPDPBUSD, CPU Feature: AVXVNNI -func (x Int32x8) UnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int8x32) Int32x8 +func (x Int8x32) UnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int32x8) Int32x8 -// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of x and y and then adds z. // // Asm: VPDPBUSD, CPU Feature: AVX512VNNI -func (x Int32x16) UnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int8x64) Int32x16 - -// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x. -// -// Asm: VPDPBUSD, CPU Feature: AVXVNNI -func (x Uint32x4) UnsignedSignedQuadDotProdAccumulate(y Uint8x16, z Int8x16) Uint32x4 - -// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x. -// -// Asm: VPDPBUSD, CPU Feature: AVXVNNI -func (x Uint32x8) UnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int8x32) Uint32x8 - -// UnsignedSignedQuadDotProdAccumulate performs dot products on groups of 4 elements of y and z and accumulates the results to x. -// -// Asm: VPDPBUSD, CPU Feature: AVX512VNNI -func (x Uint32x16) UnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int8x64) Uint32x16 +func (x Int8x64) UnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int32x16) Int32x16 /* UnsignedSignedQuadDotProdAccumulateMasked */ -// UnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of y and z and accumulates the results to x. -// -// Asm: VPDPBUSD, CPU Feature: AVX512VNNI -func (x Int32x4) UnsignedSignedQuadDotProdAccumulateMasked(y Uint8x16, z Int8x16, mask Mask32x4) Int32x4 - -// UnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of y and z and accumulates the results to x. -// -// Asm: VPDPBUSD, CPU Feature: AVX512VNNI -func (x Int32x8) UnsignedSignedQuadDotProdAccumulateMasked(y Uint8x32, z Int8x32, mask Mask32x8) Int32x8 - -// UnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of y and z and accumulates the results to x. 
-// -// Asm: VPDPBUSD, CPU Feature: AVX512VNNI -func (x Int32x16) UnsignedSignedQuadDotProdAccumulateMasked(y Uint8x64, z Int8x64, mask Mask32x16) Int32x16 - -// UnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// UnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of x and y and then adds z. // // Asm: VPDPBUSD, CPU Feature: AVX512VNNI -func (x Uint32x4) UnsignedSignedQuadDotProdAccumulateMasked(y Uint8x16, z Int8x16, mask Mask32x4) Uint32x4 +func (x Int8x16) UnsignedSignedQuadDotProdAccumulateMasked(y Uint8x16, z Int32x4, mask Mask32x4) Int32x4 -// UnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// UnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of x and y and then adds z. // // Asm: VPDPBUSD, CPU Feature: AVX512VNNI -func (x Uint32x8) UnsignedSignedQuadDotProdAccumulateMasked(y Uint8x32, z Int8x32, mask Mask32x8) Uint32x8 +func (x Int8x32) UnsignedSignedQuadDotProdAccumulateMasked(y Uint8x32, z Int32x8, mask Mask32x8) Int32x8 -// UnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of y and z and accumulates the results to x. +// UnsignedSignedQuadDotProdAccumulateMasked performs dot products on groups of 4 elements of x and y and then adds z. // // Asm: VPDPBUSD, CPU Feature: AVX512VNNI -func (x Uint32x16) UnsignedSignedQuadDotProdAccumulateMasked(y Uint8x64, z Int8x64, mask Mask32x16) Uint32x16 +func (x Int8x64) UnsignedSignedQuadDotProdAccumulateMasked(y Uint8x64, z Int32x16, mask Mask32x16) Int32x16 /* Xor */ diff --git a/src/simd/simd_test.go b/src/simd/simd_test.go index d19889cc76..14e5fe3179 100644 --- a/src/simd/simd_test.go +++ b/src/simd/simd_test.go @@ -202,6 +202,25 @@ func TestAndNot(t *testing.T) { []int32{0b10, 0b00, 0b10, 0b00}, "AndNot") } +func TestPairDotProdAccumulate(t *testing.T) { + if !simd.HasAVX512GFNI() { + // TODO: this function is actually VNNI, let's implement and call the right check. + t.Skip("Test requires HasAVX512GFNI, not available on this hardware") + return + } + x := simd.LoadInt16x8Slice([]int16{2, 2, 2, 2, 2, 2, 2, 2}) + z := simd.LoadInt32x4Slice([]int32{3, 3, 3, 3}) + want := []int32{11, 11, 11, 11} + got := make([]int32, 4) + z = x.PairDotProdAccumulate(x, z) + z.StoreSlice(got) + for i := range 4 { + if got[i] != want[i] { + t.Errorf("a and b differ at index %d, got=%d, want=%d", i, got[i], want[i]) + } + } +} + // checkInt8Slices ensures that b and a are equal, to the end of b. // also serves to use the slices, to prevent accidental optimization. 
func checkInt8Slices(t *testing.T, a, b []int8) { diff --git a/src/simd/simd_wrapped_test.go b/src/simd/simd_wrapped_test.go index 8f0fb665be..d46c05e529 100644 --- a/src/simd/simd_wrapped_test.go +++ b/src/simd/simd_wrapped_test.go @@ -3294,55 +3294,6 @@ func testInt32x4Compare(t *testing.T, v0 []int32, v1 []int32, want []int32, whic } } -func testInt32x4Int16x8Int16x8Int32x4(t *testing.T, v0 []int32, v1 []int16, v2 []int16, want []int32, which string) { - t.Helper() - var gotv simd.Int32x4 - got := make([]int32, len(want)) - vec0 := simd.LoadInt32x4Slice(v0) - vec1 := simd.LoadInt16x8Slice(v1) - vec2 := simd.LoadInt16x8Slice(v2) - switch which { - case "PairDotProdAccumulate": - gotv = vec0.PairDotProdAccumulate(vec1, vec2) - case "SaturatedPairDotProdAccumulate": - gotv = vec0.SaturatedPairDotProdAccumulate(vec1, vec2) - - default: - t.Errorf("Unknown method: Int32x4.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - -func testInt32x4Int16x8Int16x8Mask32x4Int32x4(t *testing.T, v0 []int32, v1 []int16, v2 []int16, v3 []int32, want []int32, which string) { - t.Helper() - var gotv simd.Int32x4 - got := make([]int32, len(want)) - vec0 := simd.LoadInt32x4Slice(v0) - vec1 := simd.LoadInt16x8Slice(v1) - vec2 := simd.LoadInt16x8Slice(v2) - vec3 := simd.LoadInt32x4Slice(v3) - switch which { - case "PairDotProdAccumulateMasked": - gotv = vec0.PairDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x4()) - case "SaturatedPairDotProdAccumulateMasked": - gotv = vec0.SaturatedPairDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x4()) - - default: - t.Errorf("Unknown method: Int32x4.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - func testInt32x4Mask32x4Int32x4(t *testing.T, v0 []int32, v1 []int32, want []int32, which string) { t.Helper() var gotv simd.Int32x4 @@ -3445,55 +3396,6 @@ func testInt32x4TernaryMasked(t *testing.T, v0 []int32, v1 []int32, v2 []int32, } } -func testInt32x4Uint8x16Int8x16Int32x4(t *testing.T, v0 []int32, v1 []uint8, v2 []int8, want []int32, which string) { - t.Helper() - var gotv simd.Int32x4 - got := make([]int32, len(want)) - vec0 := simd.LoadInt32x4Slice(v0) - vec1 := simd.LoadUint8x16Slice(v1) - vec2 := simd.LoadInt8x16Slice(v2) - switch which { - case "SaturatedUnsignedSignedQuadDotProdAccumulate": - gotv = vec0.SaturatedUnsignedSignedQuadDotProdAccumulate(vec1, vec2) - case "UnsignedSignedQuadDotProdAccumulate": - gotv = vec0.UnsignedSignedQuadDotProdAccumulate(vec1, vec2) - - default: - t.Errorf("Unknown method: Int32x4.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - -func testInt32x4Uint8x16Int8x16Mask32x4Int32x4(t *testing.T, v0 []int32, v1 []uint8, v2 []int8, v3 []int32, want []int32, which string) { - t.Helper() - var gotv simd.Int32x4 - got := make([]int32, len(want)) - vec0 := simd.LoadInt32x4Slice(v0) - vec1 := simd.LoadUint8x16Slice(v1) - vec2 := simd.LoadInt8x16Slice(v2) - vec3 := simd.LoadInt32x4Slice(v3) - switch which { - case "SaturatedUnsignedSignedQuadDotProdAccumulateMasked": - gotv = vec0.SaturatedUnsignedSignedQuadDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x4()) - case "UnsignedSignedQuadDotProdAccumulateMasked": - gotv = 
vec0.UnsignedSignedQuadDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x4()) - - default: - t.Errorf("Unknown method: Int32x4.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - func testInt32x4Unary(t *testing.T, v0 []int32, want []int32, which string) { t.Helper() var gotv simd.Int32x4 @@ -3688,55 +3590,6 @@ func testInt32x8Compare(t *testing.T, v0 []int32, v1 []int32, want []int32, whic } } -func testInt32x8Int16x16Int16x16Int32x8(t *testing.T, v0 []int32, v1 []int16, v2 []int16, want []int32, which string) { - t.Helper() - var gotv simd.Int32x8 - got := make([]int32, len(want)) - vec0 := simd.LoadInt32x8Slice(v0) - vec1 := simd.LoadInt16x16Slice(v1) - vec2 := simd.LoadInt16x16Slice(v2) - switch which { - case "PairDotProdAccumulate": - gotv = vec0.PairDotProdAccumulate(vec1, vec2) - case "SaturatedPairDotProdAccumulate": - gotv = vec0.SaturatedPairDotProdAccumulate(vec1, vec2) - - default: - t.Errorf("Unknown method: Int32x8.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - -func testInt32x8Int16x16Int16x16Mask32x8Int32x8(t *testing.T, v0 []int32, v1 []int16, v2 []int16, v3 []int32, want []int32, which string) { - t.Helper() - var gotv simd.Int32x8 - got := make([]int32, len(want)) - vec0 := simd.LoadInt32x8Slice(v0) - vec1 := simd.LoadInt16x16Slice(v1) - vec2 := simd.LoadInt16x16Slice(v2) - vec3 := simd.LoadInt32x8Slice(v3) - switch which { - case "PairDotProdAccumulateMasked": - gotv = vec0.PairDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x8()) - case "SaturatedPairDotProdAccumulateMasked": - gotv = vec0.SaturatedPairDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x8()) - - default: - t.Errorf("Unknown method: Int32x8.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - func testInt32x8Mask32x8Int32x8(t *testing.T, v0 []int32, v1 []int32, want []int32, which string) { t.Helper() var gotv simd.Int32x8 @@ -3839,55 +3692,6 @@ func testInt32x8TernaryMasked(t *testing.T, v0 []int32, v1 []int32, v2 []int32, } } -func testInt32x8Uint8x32Int8x32Int32x8(t *testing.T, v0 []int32, v1 []uint8, v2 []int8, want []int32, which string) { - t.Helper() - var gotv simd.Int32x8 - got := make([]int32, len(want)) - vec0 := simd.LoadInt32x8Slice(v0) - vec1 := simd.LoadUint8x32Slice(v1) - vec2 := simd.LoadInt8x32Slice(v2) - switch which { - case "SaturatedUnsignedSignedQuadDotProdAccumulate": - gotv = vec0.SaturatedUnsignedSignedQuadDotProdAccumulate(vec1, vec2) - case "UnsignedSignedQuadDotProdAccumulate": - gotv = vec0.UnsignedSignedQuadDotProdAccumulate(vec1, vec2) - - default: - t.Errorf("Unknown method: Int32x8.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - -func testInt32x8Uint8x32Int8x32Mask32x8Int32x8(t *testing.T, v0 []int32, v1 []uint8, v2 []int8, v3 []int32, want []int32, which string) { - t.Helper() - var gotv simd.Int32x8 - got := make([]int32, len(want)) - vec0 := simd.LoadInt32x8Slice(v0) - vec1 := simd.LoadUint8x32Slice(v1) - vec2 := simd.LoadInt8x32Slice(v2) - vec3 := simd.LoadInt32x8Slice(v3) - switch which { - case "SaturatedUnsignedSignedQuadDotProdAccumulateMasked": - gotv 
= vec0.SaturatedUnsignedSignedQuadDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x8()) - case "UnsignedSignedQuadDotProdAccumulateMasked": - gotv = vec0.UnsignedSignedQuadDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x8()) - - default: - t.Errorf("Unknown method: Int32x8.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - func testInt32x8Unary(t *testing.T, v0 []int32, want []int32, which string) { t.Helper() var gotv simd.Int32x8 @@ -4055,55 +3859,6 @@ func testInt32x16Compare(t *testing.T, v0 []int32, v1 []int32, want []int32, whi } } -func testInt32x16Int16x32Int16x32Int32x16(t *testing.T, v0 []int32, v1 []int16, v2 []int16, want []int32, which string) { - t.Helper() - var gotv simd.Int32x16 - got := make([]int32, len(want)) - vec0 := simd.LoadInt32x16Slice(v0) - vec1 := simd.LoadInt16x32Slice(v1) - vec2 := simd.LoadInt16x32Slice(v2) - switch which { - case "PairDotProdAccumulate": - gotv = vec0.PairDotProdAccumulate(vec1, vec2) - case "SaturatedPairDotProdAccumulate": - gotv = vec0.SaturatedPairDotProdAccumulate(vec1, vec2) - - default: - t.Errorf("Unknown method: Int32x16.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - -func testInt32x16Int16x32Int16x32Mask32x16Int32x16(t *testing.T, v0 []int32, v1 []int16, v2 []int16, v3 []int32, want []int32, which string) { - t.Helper() - var gotv simd.Int32x16 - got := make([]int32, len(want)) - vec0 := simd.LoadInt32x16Slice(v0) - vec1 := simd.LoadInt16x32Slice(v1) - vec2 := simd.LoadInt16x32Slice(v2) - vec3 := simd.LoadInt32x16Slice(v3) - switch which { - case "PairDotProdAccumulateMasked": - gotv = vec0.PairDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x16()) - case "SaturatedPairDotProdAccumulateMasked": - gotv = vec0.SaturatedPairDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x16()) - - default: - t.Errorf("Unknown method: Int32x16.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - func testInt32x16Mask32x16Int32x16(t *testing.T, v0 []int32, v1 []int32, want []int32, which string) { t.Helper() var gotv simd.Int32x16 @@ -4206,55 +3961,6 @@ func testInt32x16TernaryMasked(t *testing.T, v0 []int32, v1 []int32, v2 []int32, } } -func testInt32x16Uint8x64Int8x64Int32x16(t *testing.T, v0 []int32, v1 []uint8, v2 []int8, want []int32, which string) { - t.Helper() - var gotv simd.Int32x16 - got := make([]int32, len(want)) - vec0 := simd.LoadInt32x16Slice(v0) - vec1 := simd.LoadUint8x64Slice(v1) - vec2 := simd.LoadInt8x64Slice(v2) - switch which { - case "SaturatedUnsignedSignedQuadDotProdAccumulate": - gotv = vec0.SaturatedUnsignedSignedQuadDotProdAccumulate(vec1, vec2) - case "UnsignedSignedQuadDotProdAccumulate": - gotv = vec0.UnsignedSignedQuadDotProdAccumulate(vec1, vec2) - - default: - t.Errorf("Unknown method: Int32x16.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - -func testInt32x16Uint8x64Int8x64Mask32x16Int32x16(t *testing.T, v0 []int32, v1 []uint8, v2 []int8, v3 []int32, want []int32, which string) { - t.Helper() - var gotv simd.Int32x16 - got := make([]int32, len(want)) - vec0 := simd.LoadInt32x16Slice(v0) - vec1 := 
simd.LoadUint8x64Slice(v1) - vec2 := simd.LoadInt8x64Slice(v2) - vec3 := simd.LoadInt32x16Slice(v3) - switch which { - case "SaturatedUnsignedSignedQuadDotProdAccumulateMasked": - gotv = vec0.SaturatedUnsignedSignedQuadDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x16()) - case "UnsignedSignedQuadDotProdAccumulateMasked": - gotv = vec0.UnsignedSignedQuadDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x16()) - - default: - t.Errorf("Unknown method: Int32x16.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - func testInt32x16Unary(t *testing.T, v0 []int32, want []int32, which string) { t.Helper() var gotv simd.Int32x16 @@ -6880,55 +6586,6 @@ func testUint32x4TernaryMasked(t *testing.T, v0 []uint32, v1 []uint32, v2 []uint } } -func testUint32x4Uint8x16Int8x16Mask32x4Uint32x4(t *testing.T, v0 []uint32, v1 []uint8, v2 []int8, v3 []int32, want []uint32, which string) { - t.Helper() - var gotv simd.Uint32x4 - got := make([]uint32, len(want)) - vec0 := simd.LoadUint32x4Slice(v0) - vec1 := simd.LoadUint8x16Slice(v1) - vec2 := simd.LoadInt8x16Slice(v2) - vec3 := simd.LoadInt32x4Slice(v3) - switch which { - case "SaturatedUnsignedSignedQuadDotProdAccumulateMasked": - gotv = vec0.SaturatedUnsignedSignedQuadDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x4()) - case "UnsignedSignedQuadDotProdAccumulateMasked": - gotv = vec0.UnsignedSignedQuadDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x4()) - - default: - t.Errorf("Unknown method: Uint32x4.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - -func testUint32x4Uint8x16Int8x16Uint32x4(t *testing.T, v0 []uint32, v1 []uint8, v2 []int8, want []uint32, which string) { - t.Helper() - var gotv simd.Uint32x4 - got := make([]uint32, len(want)) - vec0 := simd.LoadUint32x4Slice(v0) - vec1 := simd.LoadUint8x16Slice(v1) - vec2 := simd.LoadInt8x16Slice(v2) - switch which { - case "SaturatedUnsignedSignedQuadDotProdAccumulate": - gotv = vec0.SaturatedUnsignedSignedQuadDotProdAccumulate(vec1, vec2) - case "UnsignedSignedQuadDotProdAccumulate": - gotv = vec0.UnsignedSignedQuadDotProdAccumulate(vec1, vec2) - - default: - t.Errorf("Unknown method: Uint32x4.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - func testUint32x4Unary(t *testing.T, v0 []uint32, want []uint32, which string) { t.Helper() var gotv simd.Uint32x4 @@ -7215,55 +6872,6 @@ func testUint32x8TernaryMasked(t *testing.T, v0 []uint32, v1 []uint32, v2 []uint } } -func testUint32x8Uint8x32Int8x32Mask32x8Uint32x8(t *testing.T, v0 []uint32, v1 []uint8, v2 []int8, v3 []int32, want []uint32, which string) { - t.Helper() - var gotv simd.Uint32x8 - got := make([]uint32, len(want)) - vec0 := simd.LoadUint32x8Slice(v0) - vec1 := simd.LoadUint8x32Slice(v1) - vec2 := simd.LoadInt8x32Slice(v2) - vec3 := simd.LoadInt32x8Slice(v3) - switch which { - case "SaturatedUnsignedSignedQuadDotProdAccumulateMasked": - gotv = vec0.SaturatedUnsignedSignedQuadDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x8()) - case "UnsignedSignedQuadDotProdAccumulateMasked": - gotv = vec0.UnsignedSignedQuadDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x8()) - - default: - t.Errorf("Unknown method: Uint32x8.%s", which) - } - gotv.StoreSlice(got) - for i 
:= range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - -func testUint32x8Uint8x32Int8x32Uint32x8(t *testing.T, v0 []uint32, v1 []uint8, v2 []int8, want []uint32, which string) { - t.Helper() - var gotv simd.Uint32x8 - got := make([]uint32, len(want)) - vec0 := simd.LoadUint32x8Slice(v0) - vec1 := simd.LoadUint8x32Slice(v1) - vec2 := simd.LoadInt8x32Slice(v2) - switch which { - case "SaturatedUnsignedSignedQuadDotProdAccumulate": - gotv = vec0.SaturatedUnsignedSignedQuadDotProdAccumulate(vec1, vec2) - case "UnsignedSignedQuadDotProdAccumulate": - gotv = vec0.UnsignedSignedQuadDotProdAccumulate(vec1, vec2) - - default: - t.Errorf("Unknown method: Uint32x8.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - func testUint32x8Unary(t *testing.T, v0 []uint32, want []uint32, which string) { t.Helper() var gotv simd.Uint32x8 @@ -7525,55 +7133,6 @@ func testUint32x16TernaryMasked(t *testing.T, v0 []uint32, v1 []uint32, v2 []uin } } -func testUint32x16Uint8x64Int8x64Mask32x16Uint32x16(t *testing.T, v0 []uint32, v1 []uint8, v2 []int8, v3 []int32, want []uint32, which string) { - t.Helper() - var gotv simd.Uint32x16 - got := make([]uint32, len(want)) - vec0 := simd.LoadUint32x16Slice(v0) - vec1 := simd.LoadUint8x64Slice(v1) - vec2 := simd.LoadInt8x64Slice(v2) - vec3 := simd.LoadInt32x16Slice(v3) - switch which { - case "SaturatedUnsignedSignedQuadDotProdAccumulateMasked": - gotv = vec0.SaturatedUnsignedSignedQuadDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x16()) - case "UnsignedSignedQuadDotProdAccumulateMasked": - gotv = vec0.UnsignedSignedQuadDotProdAccumulateMasked(vec1, vec2, vec3.AsMask32x16()) - - default: - t.Errorf("Unknown method: Uint32x16.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - -func testUint32x16Uint8x64Int8x64Uint32x16(t *testing.T, v0 []uint32, v1 []uint8, v2 []int8, want []uint32, which string) { - t.Helper() - var gotv simd.Uint32x16 - got := make([]uint32, len(want)) - vec0 := simd.LoadUint32x16Slice(v0) - vec1 := simd.LoadUint8x64Slice(v1) - vec2 := simd.LoadInt8x64Slice(v2) - switch which { - case "SaturatedUnsignedSignedQuadDotProdAccumulate": - gotv = vec0.SaturatedUnsignedSignedQuadDotProdAccumulate(vec1, vec2) - case "UnsignedSignedQuadDotProdAccumulate": - gotv = vec0.UnsignedSignedQuadDotProdAccumulate(vec1, vec2) - - default: - t.Errorf("Unknown method: Uint32x16.%s", which) - } - gotv.StoreSlice(got) - for i := range len(want) { - if got[i] != want[i] { - t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i]) - } - } -} - func testUint32x16Unary(t *testing.T, v0 []uint32, want []uint32, which string) { t.Helper() var gotv simd.Uint32x16 @@ -8430,6 +7989,8 @@ func testUint64x8UnaryMasked(t *testing.T, v0 []uint64, v1 []int64, want []uint6 // GaloisFieldAffineTransformMasked // Get128 // GetElem +// PairDotProdAccumulate +// PairDotProdAccumulateMasked // Permute // Permute2 // Permute2Masked @@ -8440,6 +8001,10 @@ func testUint64x8UnaryMasked(t *testing.T, v0 []uint64, v1 []int64, want []uint6 // RotateAllRightMasked // RoundWithPrecision // RoundWithPrecisionMasked +// SaturatedPairDotProdAccumulate +// SaturatedPairDotProdAccumulateMasked +// SaturatedUnsignedSignedQuadDotProdAccumulate +// 
SaturatedUnsignedSignedQuadDotProdAccumulateMasked // Set128 // SetElem // ShiftAllLeft @@ -8452,3 +8017,5 @@ func testUint64x8UnaryMasked(t *testing.T, v0 []uint64, v1 []int64, want []uint6 // ShiftAllRightMasked // TruncWithPrecision // TruncWithPrecisionMasked +// UnsignedSignedQuadDotProdAccumulate +// UnsignedSignedQuadDotProdAccumulateMasked -- 2.52.0
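
Editor's appendix (illustrative sketches, not part of the CL). The updated doc comments above describe the fused multiply-add family as (x * y) + z with an even/odd split for the AddSub/SubAdd variants. The helper below is a hypothetical scalar reference for Float32x4.FusedMultiplyAddSub only; it ignores the single-rounding property of the hardware FMA, which a scalar multiply-then-add does not have.

package main

import "fmt"

// fusedMultiplyAddSubRef models the updated doc comment: (x*y) - z for
// odd-indexed elements and (x*y) + z for even-indexed elements.
// Hypothetical scalar sketch; the real method compiles to VFMADDSUB213PS.
func fusedMultiplyAddSubRef(x, y, z [4]float32) [4]float32 {
	var out [4]float32
	for i := range out {
		if i%2 == 1 {
			out[i] = x[i]*y[i] - z[i]
		} else {
			out[i] = x[i]*y[i] + z[i]
		}
	}
	return out
}

func main() {
	x := [4]float32{1, 2, 3, 4}
	fmt.Println(fusedMultiplyAddSubRef(x, x, [4]float32{1, 1, 1, 1})) // [2 3 10 15]
}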
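A second sketch models the reordered PairDotProdAccumulate convention, dotprod(x, y) + z, and reproduces the expectation in TestPairDotProdAccumulate above (2*2 + 2*2 + 3 = 11 in every lane). The function name is hypothetical; only the arithmetic convention comes from this CL.

package main

import "fmt"

// pairDotProdAccumulateRef models Int16x8.PairDotProdAccumulate under the new
// parameter order: lane i of the result is x[2i]*y[2i] + x[2i+1]*y[2i+1] + z[i].
// Hypothetical scalar reference; the real method compiles to VPDPWSSD.
func pairDotProdAccumulateRef(x, y [8]int16, z [4]int32) [4]int32 {
	var out [4]int32
	for i := range out {
		out[i] = int32(x[2*i])*int32(y[2*i]) + int32(x[2*i+1])*int32(y[2*i+1]) + z[i]
	}
	return out
}

func main() {
	x := [8]int16{2, 2, 2, 2, 2, 2, 2, 2}
	z := [4]int32{3, 3, 3, 3}
	fmt.Println(pairDotProdAccumulateRef(x, x, z)) // [11 11 11 11], as in the test
}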
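Finally, a hypothetical scalar model of the quad variants. Per the new signatures, x holds signed bytes, y unsigned bytes, and z the 32-bit accumulators; each int8*uint8 product fits in int32, so the model below is exact for the non-saturating VPDPBUSD, while the saturating VPDPBUSDS form would additionally clamp the final addition (clamping elided here).

package main

import "fmt"

// quadDotProdAccumulateRef models Int8x16.UnsignedSignedQuadDotProdAccumulate
// under the new order: lane i of the result is z[i] plus the dot product of
// x[4i..4i+3] (signed bytes) with y[4i..4i+3] (unsigned bytes).
// Hypothetical scalar sketch, not code from this CL.
func quadDotProdAccumulateRef(x [16]int8, y [16]uint8, z [4]int32) [4]int32 {
	var out [4]int32
	for i := range out {
		sum := z[i]
		for k := 0; k < 4; k++ {
			sum += int32(x[4*i+k]) * int32(y[4*i+k])
		}
		out[i] = sum
	}
	return out
}

func main() {
	var x [16]int8
	var y [16]uint8
	for i := range x {
		x[i], y[i] = 1, 3
	}
	z := [4]int32{10, 10, 10, 10}
	fmt.Println(quadDotProdAccumulateRef(x, y, z)) // [22 22 22 22]: 4*(1*3) + 10
}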