From 4d26d66a49c51b5a7c610c4815322809b31962d9 Mon Sep 17 00:00:00 2001 From: David Chase Date: Mon, 17 Nov 2025 15:31:36 -0500 Subject: [PATCH] [dev.simd] simd: fix signatures for PermuteConstant* methods This moves the packed-immediate methods to package-private, and adds exported versions with four parameters. Rename PermuteConstant to PermuteScalars Rename VPSHUFB Permute to PermuteOrZero Rename Permute2 to ConcatPermute Comments were repaired/enhanced. Modified the generator to support an additional tag "hideMaskMethods : true" to suppress method, intrinsic, generic, and generic translation generation for said mask-modified versions of such methods (this is already true for exported methods). Change-Id: I91e208c1fff1f28ebce4edb4e73d26003715018c Reviewed-on: https://go-review.googlesource.com/c/go/+/721342 LUCI-TryBot-Result: Go LUCI Reviewed-by: Junyang Shao Reviewed-by: Cherry Mui --- src/cmd/compile/internal/amd64/simdssa.go | 271 ++-- .../compile/internal/ssa/_gen/simdAMD64.rules | 231 ++-- .../compile/internal/ssa/_gen/simdAMD64ops.go | 11 + .../internal/ssa/_gen/simdgenericOps.go | 110 +- src/cmd/compile/internal/ssa/opGen.go | 862 ++++++++----- src/cmd/compile/internal/ssa/rewriteAMD64.go | 1128 +++++++++-------- .../compile/internal/ssagen/simdintrinsics.go | 114 +- src/simd/_gen/simdgen/gen_simdGenericOps.go | 3 + src/simd/_gen/simdgen/gen_simdIntrinsics.go | 3 + src/simd/_gen/simdgen/gen_simdTypes.go | 3 + src/simd/_gen/simdgen/gen_simdrules.go | 3 +- src/simd/_gen/simdgen/godefs.go | 33 +- .../_gen/simdgen/ops/Moves/categories.yaml | 32 +- src/simd/_gen/simdgen/ops/Moves/go.yaml | 156 ++- src/simd/internal/simd_test/simd_test.go | 89 +- src/simd/ops_amd64.go | 848 +++++-------- src/simd/ops_internal_amd64.go | 214 ++++ src/simd/shuffles_amd64.go | 277 ++++ 18 files changed, 2591 insertions(+), 1797 deletions(-) diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index 3f8ce17972..b70a72b2f8 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -396,7 +396,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPOR256, ssa.OpAMD64VPORD512, ssa.OpAMD64VPORQ512, - ssa.OpAMD64VPSHUFB128, + ssa.OpAMD64VPERMB128, ssa.OpAMD64VPERMB256, ssa.OpAMD64VPERMB512, ssa.OpAMD64VPERMW128, @@ -410,6 +410,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPERMQ256, ssa.OpAMD64VPERMPD512, ssa.OpAMD64VPERMQ512, + ssa.OpAMD64VPSHUFB128, ssa.OpAMD64VPSHUFB256, ssa.OpAMD64VPSHUFB512, ssa.OpAMD64VPROLVD128, @@ -672,9 +673,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPORQMasked128, ssa.OpAMD64VPORQMasked256, ssa.OpAMD64VPORQMasked512, - ssa.OpAMD64VPSHUFBMasked256, - ssa.OpAMD64VPSHUFBMasked512, - ssa.OpAMD64VPSHUFBMasked128, + ssa.OpAMD64VPERMBMasked128, ssa.OpAMD64VPERMBMasked256, ssa.OpAMD64VPERMBMasked512, ssa.OpAMD64VPERMWMasked128, @@ -688,6 +687,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPERMQMasked256, ssa.OpAMD64VPERMPDMasked512, ssa.OpAMD64VPERMQMasked512, + ssa.OpAMD64VPSHUFBMasked256, + ssa.OpAMD64VPSHUFBMasked512, + ssa.OpAMD64VPSHUFBMasked128, ssa.OpAMD64VPROLVDMasked128, ssa.OpAMD64VPROLVDMasked256, ssa.OpAMD64VPROLVDMasked512, @@ -1011,12 +1013,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VEXTRACTF64X4256, ssa.OpAMD64VEXTRACTI128128, ssa.OpAMD64VEXTRACTI64X4256, - ssa.OpAMD64VPSHUFD128, - ssa.OpAMD64VPSHUFD256, - ssa.OpAMD64VPSHUFD512, - 
ssa.OpAMD64VPSHUFHW128, - ssa.OpAMD64VPSHUFHW256, - ssa.OpAMD64VPSHUFHW512, ssa.OpAMD64VPROLD128, ssa.OpAMD64VPROLD256, ssa.OpAMD64VPROLD512, @@ -1029,6 +1025,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPRORQ128, ssa.OpAMD64VPRORQ256, ssa.OpAMD64VPRORQ512, + ssa.OpAMD64VPSHUFD128, + ssa.OpAMD64VPSHUFD256, + ssa.OpAMD64VPSHUFD512, + ssa.OpAMD64VPSHUFHW128, + ssa.OpAMD64VPSHUFHW256, + ssa.OpAMD64VPSHUFHW512, + ssa.OpAMD64VPSHUFLW128, + ssa.OpAMD64VPSHUFLW256, + ssa.OpAMD64VPSHUFLW512, ssa.OpAMD64VPSLLW128const, ssa.OpAMD64VPSLLW256const, ssa.OpAMD64VPSLLW512const, @@ -1070,12 +1075,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VREDUCEPDMasked128, ssa.OpAMD64VREDUCEPDMasked256, ssa.OpAMD64VREDUCEPDMasked512, - ssa.OpAMD64VPSHUFDMasked256, - ssa.OpAMD64VPSHUFDMasked512, - ssa.OpAMD64VPSHUFHWMasked256, - ssa.OpAMD64VPSHUFHWMasked512, - ssa.OpAMD64VPSHUFHWMasked128, - ssa.OpAMD64VPSHUFDMasked128, ssa.OpAMD64VPROLDMasked128, ssa.OpAMD64VPROLDMasked256, ssa.OpAMD64VPROLDMasked512, @@ -1088,6 +1087,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPRORQMasked128, ssa.OpAMD64VPRORQMasked256, ssa.OpAMD64VPRORQMasked512, + ssa.OpAMD64VPSHUFDMasked256, + ssa.OpAMD64VPSHUFDMasked512, + ssa.OpAMD64VPSHUFHWMasked256, + ssa.OpAMD64VPSHUFHWMasked512, + ssa.OpAMD64VPSHUFHWMasked128, + ssa.OpAMD64VPSHUFLWMasked256, + ssa.OpAMD64VPSHUFLWMasked512, + ssa.OpAMD64VPSHUFLWMasked128, + ssa.OpAMD64VPSHUFDMasked128, ssa.OpAMD64VPSLLWMasked128const, ssa.OpAMD64VPSLLWMasked256const, ssa.OpAMD64VPSLLWMasked512const, @@ -1209,6 +1217,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { case ssa.OpAMD64VPDPWSSD128, ssa.OpAMD64VPDPWSSD256, ssa.OpAMD64VPDPWSSD512, + ssa.OpAMD64VPERMI2B128, + ssa.OpAMD64VPERMI2B256, + ssa.OpAMD64VPERMI2B512, + ssa.OpAMD64VPERMI2W128, + ssa.OpAMD64VPERMI2W256, + ssa.OpAMD64VPERMI2W512, + ssa.OpAMD64VPERMI2PS128, + ssa.OpAMD64VPERMI2D128, + ssa.OpAMD64VPERMI2PS256, + ssa.OpAMD64VPERMI2D256, + ssa.OpAMD64VPERMI2PS512, + ssa.OpAMD64VPERMI2D512, + ssa.OpAMD64VPERMI2PD128, + ssa.OpAMD64VPERMI2Q128, + ssa.OpAMD64VPERMI2PD256, + ssa.OpAMD64VPERMI2Q256, + ssa.OpAMD64VPERMI2PD512, + ssa.OpAMD64VPERMI2Q512, ssa.OpAMD64VPDPBUSD128, ssa.OpAMD64VPDPBUSD256, ssa.OpAMD64VPDPBUSD512, @@ -1233,24 +1259,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VFMSUBADD213PD128, ssa.OpAMD64VFMSUBADD213PD256, ssa.OpAMD64VFMSUBADD213PD512, - ssa.OpAMD64VPERMI2B128, - ssa.OpAMD64VPERMI2B256, - ssa.OpAMD64VPERMI2B512, - ssa.OpAMD64VPERMI2W128, - ssa.OpAMD64VPERMI2W256, - ssa.OpAMD64VPERMI2W512, - ssa.OpAMD64VPERMI2PS128, - ssa.OpAMD64VPERMI2D128, - ssa.OpAMD64VPERMI2PS256, - ssa.OpAMD64VPERMI2D256, - ssa.OpAMD64VPERMI2PS512, - ssa.OpAMD64VPERMI2D512, - ssa.OpAMD64VPERMI2PD128, - ssa.OpAMD64VPERMI2Q128, - ssa.OpAMD64VPERMI2PD256, - ssa.OpAMD64VPERMI2Q256, - ssa.OpAMD64VPERMI2PD512, - ssa.OpAMD64VPERMI2Q512, ssa.OpAMD64VPSHLDVW128, ssa.OpAMD64VPSHLDVW256, ssa.OpAMD64VPSHLDVW512, @@ -1316,6 +1324,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPAVGWMasked128Merging, ssa.OpAMD64VPAVGWMasked256Merging, ssa.OpAMD64VPAVGWMasked512Merging, + ssa.OpAMD64VPERMI2BMasked128, + ssa.OpAMD64VPERMI2BMasked256, + ssa.OpAMD64VPERMI2BMasked512, + ssa.OpAMD64VPERMI2WMasked128, + ssa.OpAMD64VPERMI2WMasked256, + ssa.OpAMD64VPERMI2WMasked512, + ssa.OpAMD64VPERMI2PSMasked128, + ssa.OpAMD64VPERMI2DMasked128, + ssa.OpAMD64VPERMI2PSMasked256, + ssa.OpAMD64VPERMI2DMasked256, + 
ssa.OpAMD64VPERMI2PSMasked512, + ssa.OpAMD64VPERMI2DMasked512, + ssa.OpAMD64VPERMI2PDMasked128, + ssa.OpAMD64VPERMI2QMasked128, + ssa.OpAMD64VPERMI2PDMasked256, + ssa.OpAMD64VPERMI2QMasked256, + ssa.OpAMD64VPERMI2PDMasked512, + ssa.OpAMD64VPERMI2QMasked512, ssa.OpAMD64VPALIGNRMasked256Merging, ssa.OpAMD64VPALIGNRMasked512Merging, ssa.OpAMD64VPALIGNRMasked128Merging, @@ -1451,24 +1477,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPORQMasked128Merging, ssa.OpAMD64VPORQMasked256Merging, ssa.OpAMD64VPORQMasked512Merging, - ssa.OpAMD64VPERMI2BMasked128, - ssa.OpAMD64VPERMI2BMasked256, - ssa.OpAMD64VPERMI2BMasked512, - ssa.OpAMD64VPERMI2WMasked128, - ssa.OpAMD64VPERMI2WMasked256, - ssa.OpAMD64VPERMI2WMasked512, - ssa.OpAMD64VPERMI2PSMasked128, - ssa.OpAMD64VPERMI2DMasked128, - ssa.OpAMD64VPERMI2PSMasked256, - ssa.OpAMD64VPERMI2DMasked256, - ssa.OpAMD64VPERMI2PSMasked512, - ssa.OpAMD64VPERMI2DMasked512, - ssa.OpAMD64VPERMI2PDMasked128, - ssa.OpAMD64VPERMI2QMasked128, - ssa.OpAMD64VPERMI2PDMasked256, - ssa.OpAMD64VPERMI2QMasked256, - ssa.OpAMD64VPERMI2PDMasked512, - ssa.OpAMD64VPERMI2QMasked512, ssa.OpAMD64VPSHUFBMasked256Merging, ssa.OpAMD64VPSHUFBMasked512Merging, ssa.OpAMD64VPSHUFBMasked128Merging, @@ -1819,6 +1827,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { p = simdV21load(s, v) case ssa.OpAMD64VPDPWSSD512load, + ssa.OpAMD64VPERMI2PS128load, + ssa.OpAMD64VPERMI2D128load, + ssa.OpAMD64VPERMI2PS256load, + ssa.OpAMD64VPERMI2D256load, + ssa.OpAMD64VPERMI2PS512load, + ssa.OpAMD64VPERMI2D512load, + ssa.OpAMD64VPERMI2PD128load, + ssa.OpAMD64VPERMI2Q128load, + ssa.OpAMD64VPERMI2PD256load, + ssa.OpAMD64VPERMI2Q256load, + ssa.OpAMD64VPERMI2PD512load, + ssa.OpAMD64VPERMI2Q512load, ssa.OpAMD64VPDPBUSD512load, ssa.OpAMD64VPDPBUSDS512load, ssa.OpAMD64VFMADD213PS128load, @@ -1839,18 +1859,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VFMSUBADD213PD128load, ssa.OpAMD64VFMSUBADD213PD256load, ssa.OpAMD64VFMSUBADD213PD512load, - ssa.OpAMD64VPERMI2PS128load, - ssa.OpAMD64VPERMI2D128load, - ssa.OpAMD64VPERMI2PS256load, - ssa.OpAMD64VPERMI2D256load, - ssa.OpAMD64VPERMI2PS512load, - ssa.OpAMD64VPERMI2D512load, - ssa.OpAMD64VPERMI2PD128load, - ssa.OpAMD64VPERMI2Q128load, - ssa.OpAMD64VPERMI2PD256load, - ssa.OpAMD64VPERMI2Q256load, - ssa.OpAMD64VPERMI2PD512load, - ssa.OpAMD64VPERMI2Q512load, ssa.OpAMD64VPSHLDVD128load, ssa.OpAMD64VPSHLDVD256load, ssa.OpAMD64VPSHLDVD512load, @@ -1868,6 +1876,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { case ssa.OpAMD64VPDPWSSDMasked128load, ssa.OpAMD64VPDPWSSDMasked256load, ssa.OpAMD64VPDPWSSDMasked512load, + ssa.OpAMD64VPERMI2PSMasked128load, + ssa.OpAMD64VPERMI2DMasked128load, + ssa.OpAMD64VPERMI2PSMasked256load, + ssa.OpAMD64VPERMI2DMasked256load, + ssa.OpAMD64VPERMI2PSMasked512load, + ssa.OpAMD64VPERMI2DMasked512load, + ssa.OpAMD64VPERMI2PDMasked128load, + ssa.OpAMD64VPERMI2QMasked128load, + ssa.OpAMD64VPERMI2PDMasked256load, + ssa.OpAMD64VPERMI2QMasked256load, + ssa.OpAMD64VPERMI2PDMasked512load, + ssa.OpAMD64VPERMI2QMasked512load, ssa.OpAMD64VPDPBUSDMasked128load, ssa.OpAMD64VPDPBUSDMasked256load, ssa.OpAMD64VPDPBUSDMasked512load, @@ -1892,18 +1912,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VFMSUBADD213PDMasked128load, ssa.OpAMD64VFMSUBADD213PDMasked256load, ssa.OpAMD64VFMSUBADD213PDMasked512load, - ssa.OpAMD64VPERMI2PSMasked128load, - ssa.OpAMD64VPERMI2DMasked128load, - ssa.OpAMD64VPERMI2PSMasked256load, - ssa.OpAMD64VPERMI2DMasked256load, - 
ssa.OpAMD64VPERMI2PSMasked512load, - ssa.OpAMD64VPERMI2DMasked512load, - ssa.OpAMD64VPERMI2PDMasked128load, - ssa.OpAMD64VPERMI2QMasked128load, - ssa.OpAMD64VPERMI2PDMasked256load, - ssa.OpAMD64VPERMI2QMasked256load, - ssa.OpAMD64VPERMI2PDMasked512load, - ssa.OpAMD64VPERMI2QMasked512load, ssa.OpAMD64VPSHLDVDMasked128load, ssa.OpAMD64VPSHLDVDMasked256load, ssa.OpAMD64VPSHLDVDMasked512load, @@ -2124,7 +2132,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VREDUCEPD128load, ssa.OpAMD64VREDUCEPD256load, ssa.OpAMD64VREDUCEPD512load, - ssa.OpAMD64VPSHUFD512load, ssa.OpAMD64VPROLD128load, ssa.OpAMD64VPROLD256load, ssa.OpAMD64VPROLD512load, @@ -2137,6 +2144,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPRORQ128load, ssa.OpAMD64VPRORQ256load, ssa.OpAMD64VPRORQ512load, + ssa.OpAMD64VPSHUFD512load, ssa.OpAMD64VPSLLD512constload, ssa.OpAMD64VPSLLQ512constload, ssa.OpAMD64VPSRLD512constload, @@ -2159,9 +2167,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VREDUCEPDMasked128load, ssa.OpAMD64VREDUCEPDMasked256load, ssa.OpAMD64VREDUCEPDMasked512load, - ssa.OpAMD64VPSHUFDMasked256load, - ssa.OpAMD64VPSHUFDMasked512load, - ssa.OpAMD64VPSHUFDMasked128load, ssa.OpAMD64VPROLDMasked128load, ssa.OpAMD64VPROLDMasked256load, ssa.OpAMD64VPROLDMasked512load, @@ -2174,6 +2179,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPRORQMasked128load, ssa.OpAMD64VPRORQMasked256load, ssa.OpAMD64VPRORQMasked512load, + ssa.OpAMD64VPSHUFDMasked256load, + ssa.OpAMD64VPSHUFDMasked512load, + ssa.OpAMD64VPSHUFDMasked128load, ssa.OpAMD64VPSLLDMasked128constload, ssa.OpAMD64VPSLLDMasked256constload, ssa.OpAMD64VPSLLDMasked512constload, @@ -2447,12 +2455,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPOPCNTQMasked128Merging, ssa.OpAMD64VPOPCNTQMasked256Merging, ssa.OpAMD64VPOPCNTQMasked512Merging, - ssa.OpAMD64VPSHUFDMasked256Merging, - ssa.OpAMD64VPSHUFDMasked512Merging, - ssa.OpAMD64VPSHUFHWMasked256Merging, - ssa.OpAMD64VPSHUFHWMasked512Merging, - ssa.OpAMD64VPSHUFHWMasked128Merging, - ssa.OpAMD64VPSHUFDMasked128Merging, ssa.OpAMD64VRCP14PSMasked128Merging, ssa.OpAMD64VRCP14PSMasked256Merging, ssa.OpAMD64VRCP14PSMasked512Merging, @@ -2483,6 +2485,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VSQRTPDMasked128Merging, ssa.OpAMD64VSQRTPDMasked256Merging, ssa.OpAMD64VSQRTPDMasked512Merging, + ssa.OpAMD64VPSHUFDMasked256Merging, + ssa.OpAMD64VPSHUFDMasked512Merging, + ssa.OpAMD64VPSHUFHWMasked256Merging, + ssa.OpAMD64VPSHUFHWMasked512Merging, + ssa.OpAMD64VPSHUFHWMasked128Merging, + ssa.OpAMD64VPSHUFLWMasked256Merging, + ssa.OpAMD64VPSHUFLWMasked512Merging, + ssa.OpAMD64VPSHUFLWMasked128Merging, + ssa.OpAMD64VPSHUFDMasked128Merging, ssa.OpAMD64VPSLLWMasked128constMerging, ssa.OpAMD64VPSLLWMasked256constMerging, ssa.OpAMD64VPSLLWMasked512constMerging, @@ -2674,6 +2685,36 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPCOMPRESSQMasked128, ssa.OpAMD64VPCOMPRESSQMasked256, ssa.OpAMD64VPCOMPRESSQMasked512, + ssa.OpAMD64VPERMI2BMasked128, + ssa.OpAMD64VPERMI2BMasked256, + ssa.OpAMD64VPERMI2BMasked512, + ssa.OpAMD64VPERMI2WMasked128, + ssa.OpAMD64VPERMI2WMasked256, + ssa.OpAMD64VPERMI2WMasked512, + ssa.OpAMD64VPERMI2PSMasked128, + ssa.OpAMD64VPERMI2PSMasked128load, + ssa.OpAMD64VPERMI2DMasked128, + ssa.OpAMD64VPERMI2DMasked128load, + ssa.OpAMD64VPERMI2PSMasked256, + ssa.OpAMD64VPERMI2PSMasked256load, + ssa.OpAMD64VPERMI2DMasked256, + 
ssa.OpAMD64VPERMI2DMasked256load, + ssa.OpAMD64VPERMI2PSMasked512, + ssa.OpAMD64VPERMI2PSMasked512load, + ssa.OpAMD64VPERMI2DMasked512, + ssa.OpAMD64VPERMI2DMasked512load, + ssa.OpAMD64VPERMI2PDMasked128, + ssa.OpAMD64VPERMI2PDMasked128load, + ssa.OpAMD64VPERMI2QMasked128, + ssa.OpAMD64VPERMI2QMasked128load, + ssa.OpAMD64VPERMI2PDMasked256, + ssa.OpAMD64VPERMI2PDMasked256load, + ssa.OpAMD64VPERMI2QMasked256, + ssa.OpAMD64VPERMI2QMasked256load, + ssa.OpAMD64VPERMI2PDMasked512, + ssa.OpAMD64VPERMI2PDMasked512load, + ssa.OpAMD64VPERMI2QMasked512, + ssa.OpAMD64VPERMI2QMasked512load, ssa.OpAMD64VPALIGNRMasked256, ssa.OpAMD64VPALIGNRMasked512, ssa.OpAMD64VPALIGNRMasked128, @@ -3061,48 +3102,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPORQMasked256load, ssa.OpAMD64VPORQMasked512, ssa.OpAMD64VPORQMasked512load, - ssa.OpAMD64VPERMI2BMasked128, - ssa.OpAMD64VPERMI2BMasked256, - ssa.OpAMD64VPERMI2BMasked512, - ssa.OpAMD64VPERMI2WMasked128, - ssa.OpAMD64VPERMI2WMasked256, - ssa.OpAMD64VPERMI2WMasked512, - ssa.OpAMD64VPERMI2PSMasked128, - ssa.OpAMD64VPERMI2PSMasked128load, - ssa.OpAMD64VPERMI2DMasked128, - ssa.OpAMD64VPERMI2DMasked128load, - ssa.OpAMD64VPERMI2PSMasked256, - ssa.OpAMD64VPERMI2PSMasked256load, - ssa.OpAMD64VPERMI2DMasked256, - ssa.OpAMD64VPERMI2DMasked256load, - ssa.OpAMD64VPERMI2PSMasked512, - ssa.OpAMD64VPERMI2PSMasked512load, - ssa.OpAMD64VPERMI2DMasked512, - ssa.OpAMD64VPERMI2DMasked512load, - ssa.OpAMD64VPERMI2PDMasked128, - ssa.OpAMD64VPERMI2PDMasked128load, - ssa.OpAMD64VPERMI2QMasked128, - ssa.OpAMD64VPERMI2QMasked128load, - ssa.OpAMD64VPERMI2PDMasked256, - ssa.OpAMD64VPERMI2PDMasked256load, - ssa.OpAMD64VPERMI2QMasked256, - ssa.OpAMD64VPERMI2QMasked256load, - ssa.OpAMD64VPERMI2PDMasked512, - ssa.OpAMD64VPERMI2PDMasked512load, - ssa.OpAMD64VPERMI2QMasked512, - ssa.OpAMD64VPERMI2QMasked512load, - ssa.OpAMD64VPSHUFDMasked256, - ssa.OpAMD64VPSHUFDMasked256load, - ssa.OpAMD64VPSHUFDMasked512, - ssa.OpAMD64VPSHUFDMasked512load, - ssa.OpAMD64VPSHUFHWMasked256, - ssa.OpAMD64VPSHUFHWMasked512, - ssa.OpAMD64VPSHUFHWMasked128, - ssa.OpAMD64VPSHUFDMasked128, - ssa.OpAMD64VPSHUFDMasked128load, - ssa.OpAMD64VPSHUFBMasked256, - ssa.OpAMD64VPSHUFBMasked512, - ssa.OpAMD64VPSHUFBMasked128, + ssa.OpAMD64VPERMBMasked128, ssa.OpAMD64VPERMBMasked256, ssa.OpAMD64VPERMBMasked512, ssa.OpAMD64VPERMWMasked128, @@ -3124,6 +3124,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPERMPDMasked512load, ssa.OpAMD64VPERMQMasked512, ssa.OpAMD64VPERMQMasked512load, + ssa.OpAMD64VPSHUFBMasked256, + ssa.OpAMD64VPSHUFBMasked512, + ssa.OpAMD64VPSHUFBMasked128, ssa.OpAMD64VRCP14PSMasked128, ssa.OpAMD64VRCP14PSMasked128load, ssa.OpAMD64VRCP14PSMasked256, @@ -3418,6 +3421,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VMOVDQU64Masked128, ssa.OpAMD64VMOVDQU64Masked256, ssa.OpAMD64VMOVDQU64Masked512, + ssa.OpAMD64VPSHUFDMasked256, + ssa.OpAMD64VPSHUFDMasked256load, + ssa.OpAMD64VPSHUFDMasked512, + ssa.OpAMD64VPSHUFDMasked512load, + ssa.OpAMD64VPSHUFHWMasked256, + ssa.OpAMD64VPSHUFHWMasked512, + ssa.OpAMD64VPSHUFHWMasked128, + ssa.OpAMD64VPSHUFLWMasked256, + ssa.OpAMD64VPSHUFLWMasked512, + ssa.OpAMD64VPSHUFLWMasked128, + ssa.OpAMD64VPSHUFDMasked128, + ssa.OpAMD64VPSHUFDMasked128load, ssa.OpAMD64VPSLLWMasked128const, ssa.OpAMD64VPSLLWMasked256const, ssa.OpAMD64VPSLLWMasked512const, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 5a9a1c0bc7..283a2e53cd 100644 --- 
a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -216,6 +216,36 @@ (CompressUint64x2 x mask) => (VPCOMPRESSQMasked128 x (VPMOVVec64x2ToM mask)) (CompressUint64x4 x mask) => (VPCOMPRESSQMasked256 x (VPMOVVec64x4ToM mask)) (CompressUint64x8 x mask) => (VPCOMPRESSQMasked512 x (VPMOVVec64x8ToM mask)) +(ConcatPermuteFloat32x4 ...) => (VPERMI2PS128 ...) +(ConcatPermuteFloat32x8 ...) => (VPERMI2PS256 ...) +(ConcatPermuteFloat32x16 ...) => (VPERMI2PS512 ...) +(ConcatPermuteFloat64x2 ...) => (VPERMI2PD128 ...) +(ConcatPermuteFloat64x4 ...) => (VPERMI2PD256 ...) +(ConcatPermuteFloat64x8 ...) => (VPERMI2PD512 ...) +(ConcatPermuteInt8x16 ...) => (VPERMI2B128 ...) +(ConcatPermuteInt8x32 ...) => (VPERMI2B256 ...) +(ConcatPermuteInt8x64 ...) => (VPERMI2B512 ...) +(ConcatPermuteInt16x8 ...) => (VPERMI2W128 ...) +(ConcatPermuteInt16x16 ...) => (VPERMI2W256 ...) +(ConcatPermuteInt16x32 ...) => (VPERMI2W512 ...) +(ConcatPermuteInt32x4 ...) => (VPERMI2D128 ...) +(ConcatPermuteInt32x8 ...) => (VPERMI2D256 ...) +(ConcatPermuteInt32x16 ...) => (VPERMI2D512 ...) +(ConcatPermuteInt64x2 ...) => (VPERMI2Q128 ...) +(ConcatPermuteInt64x4 ...) => (VPERMI2Q256 ...) +(ConcatPermuteInt64x8 ...) => (VPERMI2Q512 ...) +(ConcatPermuteUint8x16 ...) => (VPERMI2B128 ...) +(ConcatPermuteUint8x32 ...) => (VPERMI2B256 ...) +(ConcatPermuteUint8x64 ...) => (VPERMI2B512 ...) +(ConcatPermuteUint16x8 ...) => (VPERMI2W128 ...) +(ConcatPermuteUint16x16 ...) => (VPERMI2W256 ...) +(ConcatPermuteUint16x32 ...) => (VPERMI2W512 ...) +(ConcatPermuteUint32x4 ...) => (VPERMI2D128 ...) +(ConcatPermuteUint32x8 ...) => (VPERMI2D256 ...) +(ConcatPermuteUint32x16 ...) => (VPERMI2D512 ...) +(ConcatPermuteUint64x2 ...) => (VPERMI2Q128 ...) +(ConcatPermuteUint64x4 ...) => (VPERMI2Q256 ...) +(ConcatPermuteUint64x8 ...) => (VPERMI2Q512 ...) (ConcatShiftBytesRightUint8x16 ...) => (VPALIGNR128 ...) (ConcatShiftBytesRightGroupedUint8x32 ...) => (VPALIGNR256 ...) (ConcatShiftBytesRightGroupedUint8x64 ...) => (VPALIGNR512 ...) @@ -794,7 +824,7 @@ (PermuteFloat32x16 ...) => (VPERMPS512 ...) (PermuteFloat64x4 ...) => (VPERMPD256 ...) (PermuteFloat64x8 ...) => (VPERMPD512 ...) -(PermuteInt8x16 ...) => (VPSHUFB128 ...) +(PermuteInt8x16 ...) => (VPERMB128 ...) (PermuteInt8x32 ...) => (VPERMB256 ...) (PermuteInt8x64 ...) => (VPERMB512 ...) (PermuteInt16x8 ...) => (VPERMW128 ...) @@ -804,7 +834,7 @@ (PermuteInt32x16 ...) => (VPERMD512 ...) (PermuteInt64x4 ...) => (VPERMQ256 ...) (PermuteInt64x8 ...) => (VPERMQ512 ...) -(PermuteUint8x16 ...) => (VPSHUFB128 ...) +(PermuteUint8x16 ...) => (VPERMB128 ...) (PermuteUint8x32 ...) => (VPERMB256 ...) (PermuteUint8x64 ...) => (VPERMB512 ...) (PermuteUint16x8 ...) => (VPERMW128 ...) @@ -814,62 +844,12 @@ (PermuteUint32x16 ...) => (VPERMD512 ...) (PermuteUint64x4 ...) => (VPERMQ256 ...) (PermuteUint64x8 ...) => (VPERMQ512 ...) -(Permute2Float32x4 ...) => (VPERMI2PS128 ...) -(Permute2Float32x8 ...) => (VPERMI2PS256 ...) -(Permute2Float32x16 ...) => (VPERMI2PS512 ...) -(Permute2Float64x2 ...) => (VPERMI2PD128 ...) -(Permute2Float64x4 ...) => (VPERMI2PD256 ...) -(Permute2Float64x8 ...) => (VPERMI2PD512 ...) -(Permute2Int8x16 ...) => (VPERMI2B128 ...) -(Permute2Int8x32 ...) => (VPERMI2B256 ...) -(Permute2Int8x64 ...) => (VPERMI2B512 ...) -(Permute2Int16x8 ...) => (VPERMI2W128 ...) -(Permute2Int16x16 ...) => (VPERMI2W256 ...) -(Permute2Int16x32 ...) => (VPERMI2W512 ...) -(Permute2Int32x4 ...) => (VPERMI2D128 ...) -(Permute2Int32x8 ...) => (VPERMI2D256 ...) 
-(Permute2Int32x16 ...) => (VPERMI2D512 ...) -(Permute2Int64x2 ...) => (VPERMI2Q128 ...) -(Permute2Int64x4 ...) => (VPERMI2Q256 ...) -(Permute2Int64x8 ...) => (VPERMI2Q512 ...) -(Permute2Uint8x16 ...) => (VPERMI2B128 ...) -(Permute2Uint8x32 ...) => (VPERMI2B256 ...) -(Permute2Uint8x64 ...) => (VPERMI2B512 ...) -(Permute2Uint16x8 ...) => (VPERMI2W128 ...) -(Permute2Uint16x16 ...) => (VPERMI2W256 ...) -(Permute2Uint16x32 ...) => (VPERMI2W512 ...) -(Permute2Uint32x4 ...) => (VPERMI2D128 ...) -(Permute2Uint32x8 ...) => (VPERMI2D256 ...) -(Permute2Uint32x16 ...) => (VPERMI2D512 ...) -(Permute2Uint64x2 ...) => (VPERMI2Q128 ...) -(Permute2Uint64x4 ...) => (VPERMI2Q256 ...) -(Permute2Uint64x8 ...) => (VPERMI2Q512 ...) -(PermuteConstantInt32x4 ...) => (VPSHUFD128 ...) -(PermuteConstantUint32x4 ...) => (VPSHUFD128 ...) -(PermuteConstantGroupedInt32x8 ...) => (VPSHUFD256 ...) -(PermuteConstantGroupedInt32x16 ...) => (VPSHUFD512 ...) -(PermuteConstantGroupedUint32x8 ...) => (VPSHUFD256 ...) -(PermuteConstantGroupedUint32x16 ...) => (VPSHUFD512 ...) -(PermuteConstantHiInt16x8 ...) => (VPSHUFHW128 ...) -(PermuteConstantHiInt32x4 ...) => (VPSHUFHW128 ...) -(PermuteConstantHiUint16x8 ...) => (VPSHUFHW128 ...) -(PermuteConstantHiUint32x4 ...) => (VPSHUFHW128 ...) -(PermuteConstantHiGroupedInt16x16 ...) => (VPSHUFHW256 ...) -(PermuteConstantHiGroupedInt16x32 ...) => (VPSHUFHW512 ...) -(PermuteConstantHiGroupedUint16x16 ...) => (VPSHUFHW256 ...) -(PermuteConstantHiGroupedUint16x32 ...) => (VPSHUFHW512 ...) -(PermuteConstantLoInt16x8 ...) => (VPSHUFHW128 ...) -(PermuteConstantLoInt32x4 ...) => (VPSHUFHW128 ...) -(PermuteConstantLoUint16x8 ...) => (VPSHUFHW128 ...) -(PermuteConstantLoUint32x4 ...) => (VPSHUFHW128 ...) -(PermuteConstantLoGroupedInt16x16 ...) => (VPSHUFHW256 ...) -(PermuteConstantLoGroupedInt16x32 ...) => (VPSHUFHW512 ...) -(PermuteConstantLoGroupedUint16x16 ...) => (VPSHUFHW256 ...) -(PermuteConstantLoGroupedUint16x32 ...) => (VPSHUFHW512 ...) -(PermuteGroupedInt8x32 ...) => (VPSHUFB256 ...) -(PermuteGroupedInt8x64 ...) => (VPSHUFB512 ...) -(PermuteGroupedUint8x32 ...) => (VPSHUFB256 ...) -(PermuteGroupedUint8x64 ...) => (VPSHUFB512 ...) +(PermuteOrZeroInt8x16 ...) => (VPSHUFB128 ...) +(PermuteOrZeroUint8x16 ...) => (VPSHUFB128 ...) +(PermuteOrZeroGroupedInt8x32 ...) => (VPSHUFB256 ...) +(PermuteOrZeroGroupedInt8x64 ...) => (VPSHUFB512 ...) +(PermuteOrZeroGroupedUint8x32 ...) => (VPSHUFB256 ...) +(PermuteOrZeroGroupedUint8x64 ...) => (VPSHUFB512 ...) (ReciprocalFloat32x4 ...) => (VRCPPS128 ...) (ReciprocalFloat32x8 ...) => (VRCPPS256 ...) (ReciprocalFloat32x16 ...) => (VRCP14PS512 ...) @@ -1324,6 +1304,24 @@ (concatSelectedConstantGroupedUint32x16 ...) => (VSHUFPS512 ...) (concatSelectedConstantGroupedUint64x4 ...) => (VSHUFPD256 ...) (concatSelectedConstantGroupedUint64x8 ...) => (VSHUFPD512 ...) +(permuteScalarsInt32x4 ...) => (VPSHUFD128 ...) +(permuteScalarsUint32x4 ...) => (VPSHUFD128 ...) +(permuteScalarsGroupedInt32x8 ...) => (VPSHUFD256 ...) +(permuteScalarsGroupedInt32x16 ...) => (VPSHUFD512 ...) +(permuteScalarsGroupedUint32x8 ...) => (VPSHUFD256 ...) +(permuteScalarsGroupedUint32x16 ...) => (VPSHUFD512 ...) +(permuteScalarsHiInt16x8 ...) => (VPSHUFHW128 ...) +(permuteScalarsHiUint16x8 ...) => (VPSHUFHW128 ...) +(permuteScalarsHiGroupedInt16x16 ...) => (VPSHUFHW256 ...) +(permuteScalarsHiGroupedInt16x32 ...) => (VPSHUFHW512 ...) +(permuteScalarsHiGroupedUint16x16 ...) => (VPSHUFHW256 ...) +(permuteScalarsHiGroupedUint16x32 ...) => (VPSHUFHW512 ...) +(permuteScalarsLoInt16x8 ...) 
=> (VPSHUFLW128 ...) +(permuteScalarsLoUint16x8 ...) => (VPSHUFLW128 ...) +(permuteScalarsLoGroupedInt16x16 ...) => (VPSHUFLW256 ...) +(permuteScalarsLoGroupedInt16x32 ...) => (VPSHUFLW512 ...) +(permuteScalarsLoGroupedUint16x16 ...) => (VPSHUFLW256 ...) +(permuteScalarsLoGroupedUint16x32 ...) => (VPSHUFLW512 ...) (ternInt32x4 ...) => (VPTERNLOGD128 ...) (ternInt32x8 ...) => (VPTERNLOGD256 ...) (ternInt32x16 ...) => (VPTERNLOGD512 ...) @@ -1417,6 +1415,24 @@ (VMOVDQU64Masked128 (VREDUCEPD128 [a] x) mask) => (VREDUCEPDMasked128 [a] x mask) (VMOVDQU64Masked256 (VREDUCEPD256 [a] x) mask) => (VREDUCEPDMasked256 [a] x mask) (VMOVDQU64Masked512 (VREDUCEPD512 [a] x) mask) => (VREDUCEPDMasked512 [a] x mask) +(VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) => (VPERMI2BMasked128 x y z mask) +(VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) => (VPERMI2BMasked256 x y z mask) +(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask) +(VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) => (VPERMI2WMasked128 x y z mask) +(VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) => (VPERMI2WMasked256 x y z mask) +(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask) +(VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) => (VPERMI2PSMasked128 x y z mask) +(VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) => (VPERMI2DMasked128 x y z mask) +(VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) => (VPERMI2PSMasked256 x y z mask) +(VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) => (VPERMI2DMasked256 x y z mask) +(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask) +(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask) +(VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) => (VPERMI2PDMasked128 x y z mask) +(VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) => (VPERMI2QMasked128 x y z mask) +(VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) => (VPERMI2PDMasked256 x y z mask) +(VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) => (VPERMI2QMasked256 x y z mask) +(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask) +(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask) (VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask) => (VPALIGNRMasked256 [a] x y mask) (VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask) => (VPALIGNRMasked512 [a] x y mask) (VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask) => (VPALIGNRMasked128 [a] x y mask) @@ -1668,33 +1684,7 @@ (VMOVDQU64Masked512 (VPOPCNTQ512 x) mask) => (VPOPCNTQMasked512 x mask) (VMOVDQU32Masked512 (VPORD512 x y) mask) => (VPORDMasked512 x y mask) (VMOVDQU64Masked512 (VPORQ512 x y) mask) => (VPORQMasked512 x y mask) -(VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) => (VPERMI2BMasked128 x y z mask) -(VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) => (VPERMI2BMasked256 x y z mask) -(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask) -(VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) => (VPERMI2WMasked128 x y z mask) -(VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) => (VPERMI2WMasked256 x y z mask) -(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask) -(VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) => (VPERMI2PSMasked128 x y z mask) -(VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) => (VPERMI2DMasked128 x y z mask) -(VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) => (VPERMI2PSMasked256 x y z mask) -(VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) => (VPERMI2DMasked256 x y z mask) -(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask) 
-(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask) -(VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) => (VPERMI2PDMasked128 x y z mask) -(VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) => (VPERMI2QMasked128 x y z mask) -(VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) => (VPERMI2PDMasked256 x y z mask) -(VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) => (VPERMI2QMasked256 x y z mask) -(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask) -(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask) -(VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) => (VPSHUFDMasked256 [a] x mask) -(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask) -(VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) => (VPSHUFHWMasked256 [a] x mask) -(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask) -(VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) => (VPSHUFHWMasked128 [a] x mask) -(VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) => (VPSHUFDMasked128 [a] x mask) -(VMOVDQU8Masked256 (VPSHUFB256 x y) mask) => (VPSHUFBMasked256 x y mask) -(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask) -(VMOVDQU8Masked128 (VPSHUFB128 x y) mask) => (VPSHUFBMasked128 x y mask) +(VMOVDQU8Masked128 (VPERMB128 x y) mask) => (VPERMBMasked128 x y mask) (VMOVDQU8Masked256 (VPERMB256 x y) mask) => (VPERMBMasked256 x y mask) (VMOVDQU8Masked512 (VPERMB512 x y) mask) => (VPERMBMasked512 x y mask) (VMOVDQU16Masked128 (VPERMW128 x y) mask) => (VPERMWMasked128 x y mask) @@ -1708,6 +1698,9 @@ (VMOVDQU64Masked256 (VPERMQ256 x y) mask) => (VPERMQMasked256 x y mask) (VMOVDQU64Masked512 (VPERMPD512 x y) mask) => (VPERMPDMasked512 x y mask) (VMOVDQU64Masked512 (VPERMQ512 x y) mask) => (VPERMQMasked512 x y mask) +(VMOVDQU8Masked256 (VPSHUFB256 x y) mask) => (VPSHUFBMasked256 x y mask) +(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask) +(VMOVDQU8Masked128 (VPSHUFB128 x y) mask) => (VPSHUFBMasked128 x y mask) (VMOVDQU32Masked512 (VRCP14PS512 x) mask) => (VRCP14PSMasked512 x mask) (VMOVDQU64Masked128 (VRCP14PD128 x) mask) => (VRCP14PDMasked128 x mask) (VMOVDQU64Masked256 (VRCP14PD256 x) mask) => (VRCP14PDMasked256 x mask) @@ -1874,6 +1867,15 @@ (VMOVDQU16Masked512 (VPSUBUSW512 x y) mask) => (VPSUBUSWMasked512 x y mask) (VMOVDQU32Masked512 (VPXORD512 x y) mask) => (VPXORDMasked512 x y mask) (VMOVDQU64Masked512 (VPXORQ512 x y) mask) => (VPXORQMasked512 x y mask) +(VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) => (VPSHUFDMasked256 [a] x mask) +(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask) +(VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) => (VPSHUFHWMasked256 [a] x mask) +(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask) +(VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) => (VPSHUFHWMasked128 [a] x mask) +(VMOVDQU16Masked256 (VPSHUFLW256 [a] x) mask) => (VPSHUFLWMasked256 [a] x mask) +(VMOVDQU16Masked512 (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512 [a] x mask) +(VMOVDQU16Masked128 (VPSHUFLW128 [a] x) mask) => (VPSHUFLWMasked128 [a] x mask) +(VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) => (VPSHUFDMasked128 [a] x mask) (VMOVDQU16Masked128 (VPSLLW128const [a] x) mask) => (VPSLLWMasked128const [a] x mask) (VMOVDQU16Masked256 (VPSLLW256const [a] x) mask) => (VPSLLWMasked256const [a] x mask) (VMOVDQU16Masked512 (VPSLLW512const [a] x) mask) => (VPSLLWMasked512const [a] x mask) @@ -2021,6 +2023,7 @@ (VPBLENDMWMasked512 dst (VPSHLDW512 [a] x y) mask) => 
(VPSHLDWMasked512Merging dst [a] x y mask) (VPBLENDMWMasked512 dst (VPSHRDW512 [a] x y) mask) => (VPSHRDWMasked512Merging dst [a] x y mask) (VPBLENDMWMasked512 dst (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512Merging dst [a] x mask) +(VPBLENDMWMasked512 dst (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512Merging dst [a] x mask) (VPBLENDMWMasked512 dst (VPSLLVW512 x y) mask) => (VPSLLVWMasked512Merging dst x y mask) (VPBLENDMWMasked512 dst (VPSLLW512const [a] x) mask) => (VPSLLWMasked512constMerging dst [a] x mask) (VPBLENDMWMasked512 dst (VPSRAVW512 x y) mask) => (VPSRAVWMasked512Merging dst x y mask) @@ -2170,6 +2173,7 @@ (VPBLENDVB128 dst (VPSHUFB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked128Merging dst x y (VPMOVVec8x16ToM mask)) (VPBLENDVB128 dst (VPSHUFD128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked128Merging dst [a] x (VPMOVVec32x4ToM mask)) (VPBLENDVB128 dst (VPSHUFHW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked128Merging dst [a] x (VPMOVVec16x8ToM mask)) +(VPBLENDVB128 dst (VPSHUFLW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFLWMasked128Merging dst [a] x (VPMOVVec16x8ToM mask)) (VPBLENDVB128 dst (VPSLLD128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked128constMerging dst [a] x (VPMOVVec32x4ToM mask)) (VPBLENDVB128 dst (VPSLLQ128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked128constMerging dst [a] x (VPMOVVec64x2ToM mask)) (VPBLENDVB128 dst (VPSLLVD128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked128Merging dst x y (VPMOVVec32x4ToM mask)) @@ -2305,6 +2309,7 @@ (VPBLENDVB256 dst (VPSHUFB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked256Merging dst x y (VPMOVVec8x32ToM mask)) (VPBLENDVB256 dst (VPSHUFD256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked256Merging dst [a] x (VPMOVVec32x8ToM mask)) (VPBLENDVB256 dst (VPSHUFHW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked256Merging dst [a] x (VPMOVVec16x16ToM mask)) +(VPBLENDVB256 dst (VPSHUFLW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFLWMasked256Merging dst [a] x (VPMOVVec16x16ToM mask)) (VPBLENDVB256 dst (VPSLLD256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked256constMerging dst [a] x (VPMOVVec32x8ToM mask)) (VPBLENDVB256 dst (VPSLLQ256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked256constMerging dst [a] x (VPMOVVec64x4ToM mask)) (VPBLENDVB256 dst (VPSLLVD256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked256Merging dst x y (VPMOVVec32x8ToM mask)) @@ -2410,6 +2415,30 @@ (VREDUCEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) (VREDUCEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) (VREDUCEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) +(VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem) +(VPERMI2D128 x y l:(VMOVDQUload128 {sym} 
[off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem) +(VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem) +(VPERMI2D256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D256load {sym} [off] x y ptr mem) +(VPERMI2PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS512load {sym} [off] x y ptr mem) +(VPERMI2D512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D512load {sym} [off] x y ptr mem) +(VPERMI2PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD128load {sym} [off] x y ptr mem) +(VPERMI2Q128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q128load {sym} [off] x y ptr mem) +(VPERMI2PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD256load {sym} [off] x y ptr mem) +(VPERMI2Q256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q256load {sym} [off] x y ptr mem) +(VPERMI2PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD512load {sym} [off] x y ptr mem) +(VPERMI2Q512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q512load {sym} [off] x y ptr mem) +(VPERMI2PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked128load {sym} [off] x y ptr mask mem) +(VPERMI2DMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked128load {sym} [off] x y ptr mask mem) +(VPERMI2PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked256load {sym} [off] x y ptr mask mem) +(VPERMI2DMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked256load {sym} [off] x y ptr mask mem) +(VPERMI2PSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked512load {sym} [off] x y ptr mask mem) +(VPERMI2DMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked512load {sym} [off] x y ptr mask mem) +(VPERMI2PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked128load {sym} [off] x y ptr mask mem) +(VPERMI2QMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked128load {sym} [off] x y ptr mask mem) +(VPERMI2PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked256load {sym} [off] x y ptr mask mem) +(VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem) +(VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem) +(VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem) (VPACKSSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDW512load 
{sym} [off] x ptr mem) (VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem) (VPACKSSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked256load {sym} [off] x ptr mask mem) @@ -2636,34 +2665,6 @@ (VPERMQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ256load {sym} [off] x ptr mem) (VPERMPD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMPD512load {sym} [off] x ptr mem) (VPERMQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ512load {sym} [off] x ptr mem) -(VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem) -(VPERMI2D128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem) -(VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem) -(VPERMI2D256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D256load {sym} [off] x y ptr mem) -(VPERMI2PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS512load {sym} [off] x y ptr mem) -(VPERMI2D512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D512load {sym} [off] x y ptr mem) -(VPERMI2PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD128load {sym} [off] x y ptr mem) -(VPERMI2Q128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q128load {sym} [off] x y ptr mem) -(VPERMI2PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD256load {sym} [off] x y ptr mem) -(VPERMI2Q256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q256load {sym} [off] x y ptr mem) -(VPERMI2PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD512load {sym} [off] x y ptr mem) -(VPERMI2Q512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q512load {sym} [off] x y ptr mem) -(VPERMI2PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked128load {sym} [off] x y ptr mask mem) -(VPERMI2DMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked128load {sym} [off] x y ptr mask mem) -(VPERMI2PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked256load {sym} [off] x y ptr mask mem) -(VPERMI2DMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked256load {sym} [off] x y ptr mask mem) -(VPERMI2PSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked512load {sym} [off] x y ptr mask mem) -(VPERMI2DMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked512load {sym} [off] x y ptr mask mem) -(VPERMI2PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked128load {sym} [off] x y 
ptr mask mem) -(VPERMI2QMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked128load {sym} [off] x y ptr mask mem) -(VPERMI2PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked256load {sym} [off] x y ptr mask mem) -(VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem) -(VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem) -(VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem) -(VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) -(VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) -(VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) (VPERMPSMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked256load {sym} [off] x ptr mask mem) (VPERMDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMDMasked256load {sym} [off] x ptr mask mem) (VPERMPSMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked512load {sym} [off] x ptr mask mem) @@ -2862,6 +2863,10 @@ (VPBLENDMQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMQMasked512load {sym} [off] x ptr mask mem) (VSHUFPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) (VSHUFPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) +(VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) +(VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) +(VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) +(VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) (VPSLLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) (VPSLLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} 
[makeValAndOff(int32(int8(c)),off)] ptr mem) (VPSLLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index 674cfb19d6..404354d387 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -383,8 +383,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPDPWSSDMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPDPWSSDMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPDPWSSDMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true}, + {name: "VPERMB128", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPERMBMasked128", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMBMasked512", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false}, @@ -1310,6 +1312,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPSHUFHWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSHUFHWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHUFHWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPSHUFLW128", argLength: 1, reg: w11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPSHUFLW256", argLength: 1, reg: v11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPSHUFLW512", argLength: 1, reg: w11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPSHUFLWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPSHUFLWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPSHUFLWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSLLD128const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSLLD256const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSLLD512const", argLength: 1, reg: w11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: 
"Vec512", resultInArg0: false}, @@ -2392,6 +2400,9 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPSHUFHWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPSHUFHWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPSHUFHWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true}, + {name: "VPSHUFLWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPSHUFLWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true}, + {name: "VPSHUFLWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPSLLDMasked128constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPSLLDMasked256constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPSLLDMasked512constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 6a79fa3856..3fae158c0a 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -207,6 +207,36 @@ func simdGenericOps() []opData { {name: "CompressUint64x2", argLength: 2, commutative: false}, {name: "CompressUint64x4", argLength: 2, commutative: false}, {name: "CompressUint64x8", argLength: 2, commutative: false}, + {name: "ConcatPermuteFloat32x4", argLength: 3, commutative: false}, + {name: "ConcatPermuteFloat32x8", argLength: 3, commutative: false}, + {name: "ConcatPermuteFloat32x16", argLength: 3, commutative: false}, + {name: "ConcatPermuteFloat64x2", argLength: 3, commutative: false}, + {name: "ConcatPermuteFloat64x4", argLength: 3, commutative: false}, + {name: "ConcatPermuteFloat64x8", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt8x16", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt8x32", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt8x64", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt16x8", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt16x16", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt16x32", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt32x4", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt32x8", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt32x16", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt64x2", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt64x4", argLength: 3, commutative: false}, + {name: "ConcatPermuteInt64x8", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint8x16", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint8x32", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint8x64", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint16x8", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint16x16", argLength: 3, 
commutative: false}, + {name: "ConcatPermuteUint16x32", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint32x4", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint32x8", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint32x16", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint64x2", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint64x4", argLength: 3, commutative: false}, + {name: "ConcatPermuteUint64x8", argLength: 3, commutative: false}, {name: "ConvertToInt8Int16x8", argLength: 1, commutative: false}, {name: "ConvertToInt8Int16x16", argLength: 1, commutative: false}, {name: "ConvertToInt8Int16x32", argLength: 1, commutative: false}, @@ -750,44 +780,10 @@ func simdGenericOps() []opData { {name: "OrUint64x2", argLength: 2, commutative: true}, {name: "OrUint64x4", argLength: 2, commutative: true}, {name: "OrUint64x8", argLength: 2, commutative: true}, - {name: "Permute2Float32x4", argLength: 3, commutative: false}, - {name: "Permute2Float32x8", argLength: 3, commutative: false}, - {name: "Permute2Float32x16", argLength: 3, commutative: false}, - {name: "Permute2Float64x2", argLength: 3, commutative: false}, - {name: "Permute2Float64x4", argLength: 3, commutative: false}, - {name: "Permute2Float64x8", argLength: 3, commutative: false}, - {name: "Permute2Int8x16", argLength: 3, commutative: false}, - {name: "Permute2Int8x32", argLength: 3, commutative: false}, - {name: "Permute2Int8x64", argLength: 3, commutative: false}, - {name: "Permute2Int16x8", argLength: 3, commutative: false}, - {name: "Permute2Int16x16", argLength: 3, commutative: false}, - {name: "Permute2Int16x32", argLength: 3, commutative: false}, - {name: "Permute2Int32x4", argLength: 3, commutative: false}, - {name: "Permute2Int32x8", argLength: 3, commutative: false}, - {name: "Permute2Int32x16", argLength: 3, commutative: false}, - {name: "Permute2Int64x2", argLength: 3, commutative: false}, - {name: "Permute2Int64x4", argLength: 3, commutative: false}, - {name: "Permute2Int64x8", argLength: 3, commutative: false}, - {name: "Permute2Uint8x16", argLength: 3, commutative: false}, - {name: "Permute2Uint8x32", argLength: 3, commutative: false}, - {name: "Permute2Uint8x64", argLength: 3, commutative: false}, - {name: "Permute2Uint16x8", argLength: 3, commutative: false}, - {name: "Permute2Uint16x16", argLength: 3, commutative: false}, - {name: "Permute2Uint16x32", argLength: 3, commutative: false}, - {name: "Permute2Uint32x4", argLength: 3, commutative: false}, - {name: "Permute2Uint32x8", argLength: 3, commutative: false}, - {name: "Permute2Uint32x16", argLength: 3, commutative: false}, - {name: "Permute2Uint64x2", argLength: 3, commutative: false}, - {name: "Permute2Uint64x4", argLength: 3, commutative: false}, - {name: "Permute2Uint64x8", argLength: 3, commutative: false}, {name: "PermuteFloat32x8", argLength: 2, commutative: false}, {name: "PermuteFloat32x16", argLength: 2, commutative: false}, {name: "PermuteFloat64x4", argLength: 2, commutative: false}, {name: "PermuteFloat64x8", argLength: 2, commutative: false}, - {name: "PermuteGroupedInt8x32", argLength: 2, commutative: false}, - {name: "PermuteGroupedInt8x64", argLength: 2, commutative: false}, - {name: "PermuteGroupedUint8x32", argLength: 2, commutative: false}, - {name: "PermuteGroupedUint8x64", argLength: 2, commutative: false}, {name: "PermuteInt8x16", argLength: 2, commutative: false}, {name: "PermuteInt8x32", argLength: 2, commutative: false}, {name: "PermuteInt8x64", argLength: 2, commutative: false}, 
@@ -798,6 +794,12 @@ func simdGenericOps() []opData { {name: "PermuteInt32x16", argLength: 2, commutative: false}, {name: "PermuteInt64x4", argLength: 2, commutative: false}, {name: "PermuteInt64x8", argLength: 2, commutative: false}, + {name: "PermuteOrZeroGroupedInt8x32", argLength: 2, commutative: false}, + {name: "PermuteOrZeroGroupedInt8x64", argLength: 2, commutative: false}, + {name: "PermuteOrZeroGroupedUint8x32", argLength: 2, commutative: false}, + {name: "PermuteOrZeroGroupedUint8x64", argLength: 2, commutative: false}, + {name: "PermuteOrZeroInt8x16", argLength: 2, commutative: false}, + {name: "PermuteOrZeroUint8x16", argLength: 2, commutative: false}, {name: "PermuteUint8x16", argLength: 2, commutative: false}, {name: "PermuteUint8x32", argLength: 2, commutative: false}, {name: "PermuteUint8x64", argLength: 2, commutative: false}, @@ -1151,28 +1153,6 @@ func simdGenericOps() []opData { {name: "GetElemUint16x8", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemUint32x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemUint64x2", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantHiInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantHiUint32x4", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantLoInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantLoUint32x4", argLength: 1, commutative: false, aux: "UInt8"}, - {name: "PermuteConstantUint32x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RotateAllLeftInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RotateAllLeftInt32x8", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RotateAllLeftInt32x16", argLength: 1, commutative: false, aux: "UInt8"}, @@ -1292,6 +1272,24 @@ func simdGenericOps() []opData { {name: "concatSelectedConstantInt64x2", argLength: 2, commutative: false, aux: "UInt8"}, {name: "concatSelectedConstantUint32x4", 
argLength: 2, commutative: false, aux: "UInt8"}, {name: "concatSelectedConstantUint64x2", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "permuteScalarsUint32x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "ternInt32x4", argLength: 3, commutative: false, aux: "UInt8"}, {name: "ternInt32x8", argLength: 3, commutative: false, aux: "UInt8"}, {name: "ternInt32x16", argLength: 3, commutative: false, aux: "UInt8"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index ea5491362f..fa94dfbbd5 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1624,8 +1624,10 @@ const ( OpAMD64VPDPWSSDMasked128 OpAMD64VPDPWSSDMasked256 OpAMD64VPDPWSSDMasked512 + OpAMD64VPERMB128 OpAMD64VPERMB256 OpAMD64VPERMB512 + OpAMD64VPERMBMasked128 OpAMD64VPERMBMasked256 OpAMD64VPERMBMasked512 OpAMD64VPERMD256 @@ -2551,6 +2553,12 @@ const ( OpAMD64VPSHUFHWMasked128 OpAMD64VPSHUFHWMasked256 OpAMD64VPSHUFHWMasked512 + OpAMD64VPSHUFLW128 + OpAMD64VPSHUFLW256 + OpAMD64VPSHUFLW512 + OpAMD64VPSHUFLWMasked128 + OpAMD64VPSHUFLWMasked256 + OpAMD64VPSHUFLWMasked512 OpAMD64VPSLLD128const OpAMD64VPSLLD256const OpAMD64VPSLLD512const @@ -3633,6 +3641,9 @@ const ( OpAMD64VPSHUFHWMasked128Merging OpAMD64VPSHUFHWMasked256Merging OpAMD64VPSHUFHWMasked512Merging + OpAMD64VPSHUFLWMasked128Merging + OpAMD64VPSHUFLWMasked256Merging + OpAMD64VPSHUFLWMasked512Merging OpAMD64VPSLLDMasked128constMerging OpAMD64VPSLLDMasked256constMerging OpAMD64VPSLLDMasked512constMerging @@ -6155,6 +6166,36 @@ const ( OpCompressUint64x2 OpCompressUint64x4 OpCompressUint64x8 + OpConcatPermuteFloat32x4 + OpConcatPermuteFloat32x8 + OpConcatPermuteFloat32x16 + OpConcatPermuteFloat64x2 + OpConcatPermuteFloat64x4 + OpConcatPermuteFloat64x8 + OpConcatPermuteInt8x16 + OpConcatPermuteInt8x32 + OpConcatPermuteInt8x64 + OpConcatPermuteInt16x8 + OpConcatPermuteInt16x16 + OpConcatPermuteInt16x32 + OpConcatPermuteInt32x4 + OpConcatPermuteInt32x8 + 
OpConcatPermuteInt32x16 + OpConcatPermuteInt64x2 + OpConcatPermuteInt64x4 + OpConcatPermuteInt64x8 + OpConcatPermuteUint8x16 + OpConcatPermuteUint8x32 + OpConcatPermuteUint8x64 + OpConcatPermuteUint16x8 + OpConcatPermuteUint16x16 + OpConcatPermuteUint16x32 + OpConcatPermuteUint32x4 + OpConcatPermuteUint32x8 + OpConcatPermuteUint32x16 + OpConcatPermuteUint64x2 + OpConcatPermuteUint64x4 + OpConcatPermuteUint64x8 OpConvertToInt8Int16x8 OpConvertToInt8Int16x16 OpConvertToInt8Int16x32 @@ -6698,44 +6739,10 @@ const ( OpOrUint64x2 OpOrUint64x4 OpOrUint64x8 - OpPermute2Float32x4 - OpPermute2Float32x8 - OpPermute2Float32x16 - OpPermute2Float64x2 - OpPermute2Float64x4 - OpPermute2Float64x8 - OpPermute2Int8x16 - OpPermute2Int8x32 - OpPermute2Int8x64 - OpPermute2Int16x8 - OpPermute2Int16x16 - OpPermute2Int16x32 - OpPermute2Int32x4 - OpPermute2Int32x8 - OpPermute2Int32x16 - OpPermute2Int64x2 - OpPermute2Int64x4 - OpPermute2Int64x8 - OpPermute2Uint8x16 - OpPermute2Uint8x32 - OpPermute2Uint8x64 - OpPermute2Uint16x8 - OpPermute2Uint16x16 - OpPermute2Uint16x32 - OpPermute2Uint32x4 - OpPermute2Uint32x8 - OpPermute2Uint32x16 - OpPermute2Uint64x2 - OpPermute2Uint64x4 - OpPermute2Uint64x8 OpPermuteFloat32x8 OpPermuteFloat32x16 OpPermuteFloat64x4 OpPermuteFloat64x8 - OpPermuteGroupedInt8x32 - OpPermuteGroupedInt8x64 - OpPermuteGroupedUint8x32 - OpPermuteGroupedUint8x64 OpPermuteInt8x16 OpPermuteInt8x32 OpPermuteInt8x64 @@ -6746,6 +6753,12 @@ const ( OpPermuteInt32x16 OpPermuteInt64x4 OpPermuteInt64x8 + OpPermuteOrZeroGroupedInt8x32 + OpPermuteOrZeroGroupedInt8x64 + OpPermuteOrZeroGroupedUint8x32 + OpPermuteOrZeroGroupedUint8x64 + OpPermuteOrZeroInt8x16 + OpPermuteOrZeroUint8x16 OpPermuteUint8x16 OpPermuteUint8x32 OpPermuteUint8x64 @@ -7099,28 +7112,6 @@ const ( OpGetElemUint16x8 OpGetElemUint32x4 OpGetElemUint64x2 - OpPermuteConstantGroupedInt32x8 - OpPermuteConstantGroupedInt32x16 - OpPermuteConstantGroupedUint32x8 - OpPermuteConstantGroupedUint32x16 - OpPermuteConstantHiGroupedInt16x16 - OpPermuteConstantHiGroupedInt16x32 - OpPermuteConstantHiGroupedUint16x16 - OpPermuteConstantHiGroupedUint16x32 - OpPermuteConstantHiInt16x8 - OpPermuteConstantHiInt32x4 - OpPermuteConstantHiUint16x8 - OpPermuteConstantHiUint32x4 - OpPermuteConstantInt32x4 - OpPermuteConstantLoGroupedInt16x16 - OpPermuteConstantLoGroupedInt16x32 - OpPermuteConstantLoGroupedUint16x16 - OpPermuteConstantLoGroupedUint16x32 - OpPermuteConstantLoInt16x8 - OpPermuteConstantLoInt32x4 - OpPermuteConstantLoUint16x8 - OpPermuteConstantLoUint32x4 - OpPermuteConstantUint32x4 OpRotateAllLeftInt32x4 OpRotateAllLeftInt32x8 OpRotateAllLeftInt32x16 @@ -7240,6 +7231,24 @@ const ( OpconcatSelectedConstantInt64x2 OpconcatSelectedConstantUint32x4 OpconcatSelectedConstantUint64x2 + OppermuteScalarsGroupedInt32x8 + OppermuteScalarsGroupedInt32x16 + OppermuteScalarsGroupedUint32x8 + OppermuteScalarsGroupedUint32x16 + OppermuteScalarsHiGroupedInt16x16 + OppermuteScalarsHiGroupedInt16x32 + OppermuteScalarsHiGroupedUint16x16 + OppermuteScalarsHiGroupedUint16x32 + OppermuteScalarsHiInt16x8 + OppermuteScalarsHiUint16x8 + OppermuteScalarsInt32x4 + OppermuteScalarsLoGroupedInt16x16 + OppermuteScalarsLoGroupedInt16x32 + OppermuteScalarsLoGroupedUint16x16 + OppermuteScalarsLoGroupedUint16x32 + OppermuteScalarsLoInt16x8 + OppermuteScalarsLoUint16x8 + OppermuteScalarsUint32x4 OpternInt32x4 OpternInt32x8 OpternInt32x16 @@ -26142,6 +26151,20 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPERMB128", + argLen: 2, + asm: x86.AVPERMB, + reg: regInfo{ + inputs: []inputInfo{ 
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPERMB256", argLen: 2, @@ -26170,6 +26193,21 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPERMBMasked128", + argLen: 3, + asm: x86.AVPERMB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPERMBMasked256", argLen: 3, @@ -39744,6 +39782,93 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPSHUFLW128", + auxType: auxUInt8, + argLen: 1, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPSHUFLW256", + auxType: auxUInt8, + argLen: 1, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPSHUFLW512", + auxType: auxUInt8, + argLen: 1, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPSHUFLWMasked128", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPSHUFLWMasked256", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: 
"VPSHUFLWMasked512", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPSLLD128const", auxType: auxUInt8, @@ -57607,6 +57732,57 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPSHUFLWMasked128Merging", + auxType: auxUInt8, + argLen: 3, + resultInArg0: true, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPSHUFLWMasked256Merging", + auxType: auxUInt8, + argLen: 3, + resultInArg0: true, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPSHUFLWMasked512Merging", + auxType: auxUInt8, + argLen: 3, + resultInArg0: true, + asm: x86.AVPSHUFLW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPSLLDMasked128constMerging", auxType: auxUInt8, @@ -86874,6 +87050,156 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "ConcatPermuteFloat32x4", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteFloat32x8", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteFloat32x16", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteFloat64x2", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteFloat64x4", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteFloat64x8", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt8x16", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt8x32", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt8x64", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt16x8", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt16x16", + 
argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt16x32", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt32x4", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt32x8", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt32x16", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt64x2", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt64x4", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteInt64x8", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint8x16", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint8x32", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint8x64", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint16x8", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint16x16", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint16x32", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint32x4", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint32x8", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint32x16", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint64x2", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint64x4", + argLen: 3, + generic: true, + }, + { + name: "ConcatPermuteUint64x8", + argLen: 3, + generic: true, + }, { name: "ConvertToInt8Int16x8", argLen: 1, @@ -89758,242 +90084,102 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "Permute2Float32x4", - argLen: 3, - generic: true, - }, - { - name: "Permute2Float32x8", - argLen: 3, - generic: true, - }, - { - name: "Permute2Float32x16", - argLen: 3, - generic: true, - }, - { - name: "Permute2Float64x2", - argLen: 3, - generic: true, - }, - { - name: "Permute2Float64x4", - argLen: 3, - generic: true, - }, - { - name: "Permute2Float64x8", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int8x16", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int8x32", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int8x64", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int16x8", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int16x16", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int16x32", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int32x4", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int32x8", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int32x16", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int64x2", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int64x4", - argLen: 3, - generic: true, - }, - { - name: "Permute2Int64x8", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint8x16", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint8x32", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint8x64", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint16x8", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint16x16", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint16x32", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint32x4", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint32x8", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint32x16", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint64x2", - argLen: 3, - generic: true, - }, - { - name: "Permute2Uint64x4", - argLen: 3, + name: "PermuteFloat32x8", + argLen: 2, generic: true, }, { - name: "Permute2Uint64x8", - argLen: 3, + name: 
"PermuteFloat32x16", + argLen: 2, generic: true, }, { - name: "PermuteFloat32x8", + name: "PermuteFloat64x4", argLen: 2, generic: true, }, { - name: "PermuteFloat32x16", + name: "PermuteFloat64x8", argLen: 2, generic: true, }, { - name: "PermuteFloat64x4", + name: "PermuteInt8x16", argLen: 2, generic: true, }, { - name: "PermuteFloat64x8", + name: "PermuteInt8x32", argLen: 2, generic: true, }, { - name: "PermuteGroupedInt8x32", + name: "PermuteInt8x64", argLen: 2, generic: true, }, { - name: "PermuteGroupedInt8x64", + name: "PermuteInt16x8", argLen: 2, generic: true, }, { - name: "PermuteGroupedUint8x32", + name: "PermuteInt16x16", argLen: 2, generic: true, }, { - name: "PermuteGroupedUint8x64", + name: "PermuteInt16x32", argLen: 2, generic: true, }, { - name: "PermuteInt8x16", + name: "PermuteInt32x8", argLen: 2, generic: true, }, { - name: "PermuteInt8x32", + name: "PermuteInt32x16", argLen: 2, generic: true, }, { - name: "PermuteInt8x64", + name: "PermuteInt64x4", argLen: 2, generic: true, }, { - name: "PermuteInt16x8", + name: "PermuteInt64x8", argLen: 2, generic: true, }, { - name: "PermuteInt16x16", + name: "PermuteOrZeroGroupedInt8x32", argLen: 2, generic: true, }, { - name: "PermuteInt16x32", + name: "PermuteOrZeroGroupedInt8x64", argLen: 2, generic: true, }, { - name: "PermuteInt32x8", + name: "PermuteOrZeroGroupedUint8x32", argLen: 2, generic: true, }, { - name: "PermuteInt32x16", + name: "PermuteOrZeroGroupedUint8x64", argLen: 2, generic: true, }, { - name: "PermuteInt64x4", + name: "PermuteOrZeroInt8x16", argLen: 2, generic: true, }, { - name: "PermuteInt64x8", + name: "PermuteOrZeroUint8x16", argLen: 2, generic: true, }, @@ -91830,138 +92016,6 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, - { - name: "PermuteConstantGroupedInt32x8", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantGroupedInt32x16", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantGroupedUint32x8", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantGroupedUint32x16", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantHiGroupedInt16x16", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantHiGroupedInt16x32", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantHiGroupedUint16x16", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantHiGroupedUint16x32", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantHiInt16x8", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantHiInt32x4", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantHiUint16x8", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantHiUint32x4", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantInt32x4", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantLoGroupedInt16x16", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantLoGroupedInt16x32", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantLoGroupedUint16x16", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantLoGroupedUint16x32", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantLoInt16x8", - auxType: auxUInt8, - argLen: 1, - 
generic: true, - }, - { - name: "PermuteConstantLoInt32x4", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantLoUint16x8", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantLoUint32x4", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, - { - name: "PermuteConstantUint32x4", - auxType: auxUInt8, - argLen: 1, - generic: true, - }, { name: "RotateAllLeftInt32x4", auxType: auxUInt8, @@ -92676,6 +92730,114 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "permuteScalarsGroupedInt32x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsGroupedInt32x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsGroupedUint32x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsGroupedUint32x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsHiGroupedInt16x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsHiGroupedInt16x32", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsHiGroupedUint16x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsHiGroupedUint16x32", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsHiInt16x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsHiUint16x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsInt32x4", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsLoGroupedInt16x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsLoGroupedInt16x32", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsLoGroupedUint16x16", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsLoGroupedUint16x32", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsLoInt16x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsLoUint16x8", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "permuteScalarsUint32x4", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, { name: "ternInt32x4", auxType: auxUInt8, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 76e524d524..5ad2ed3f96 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -2546,6 +2546,96 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpCompressUint8x32(v) case OpCompressUint8x64: return rewriteValueAMD64_OpCompressUint8x64(v) + case OpConcatPermuteFloat32x16: + v.Op = OpAMD64VPERMI2PS512 + return true + case OpConcatPermuteFloat32x4: + v.Op = OpAMD64VPERMI2PS128 + return true + case OpConcatPermuteFloat32x8: + v.Op = OpAMD64VPERMI2PS256 + return true + case OpConcatPermuteFloat64x2: + v.Op = OpAMD64VPERMI2PD128 + return true + case OpConcatPermuteFloat64x4: + v.Op = OpAMD64VPERMI2PD256 + return true + case OpConcatPermuteFloat64x8: + v.Op = OpAMD64VPERMI2PD512 + return true + case OpConcatPermuteInt16x16: + v.Op = OpAMD64VPERMI2W256 + return true + case OpConcatPermuteInt16x32: + v.Op = OpAMD64VPERMI2W512 + return true + case OpConcatPermuteInt16x8: + v.Op = OpAMD64VPERMI2W128 + return true + case OpConcatPermuteInt32x16: + v.Op = OpAMD64VPERMI2D512 + 
return true + case OpConcatPermuteInt32x4: + v.Op = OpAMD64VPERMI2D128 + return true + case OpConcatPermuteInt32x8: + v.Op = OpAMD64VPERMI2D256 + return true + case OpConcatPermuteInt64x2: + v.Op = OpAMD64VPERMI2Q128 + return true + case OpConcatPermuteInt64x4: + v.Op = OpAMD64VPERMI2Q256 + return true + case OpConcatPermuteInt64x8: + v.Op = OpAMD64VPERMI2Q512 + return true + case OpConcatPermuteInt8x16: + v.Op = OpAMD64VPERMI2B128 + return true + case OpConcatPermuteInt8x32: + v.Op = OpAMD64VPERMI2B256 + return true + case OpConcatPermuteInt8x64: + v.Op = OpAMD64VPERMI2B512 + return true + case OpConcatPermuteUint16x16: + v.Op = OpAMD64VPERMI2W256 + return true + case OpConcatPermuteUint16x32: + v.Op = OpAMD64VPERMI2W512 + return true + case OpConcatPermuteUint16x8: + v.Op = OpAMD64VPERMI2W128 + return true + case OpConcatPermuteUint32x16: + v.Op = OpAMD64VPERMI2D512 + return true + case OpConcatPermuteUint32x4: + v.Op = OpAMD64VPERMI2D128 + return true + case OpConcatPermuteUint32x8: + v.Op = OpAMD64VPERMI2D256 + return true + case OpConcatPermuteUint64x2: + v.Op = OpAMD64VPERMI2Q128 + return true + case OpConcatPermuteUint64x4: + v.Op = OpAMD64VPERMI2Q256 + return true + case OpConcatPermuteUint64x8: + v.Op = OpAMD64VPERMI2Q512 + return true + case OpConcatPermuteUint8x16: + v.Op = OpAMD64VPERMI2B128 + return true + case OpConcatPermuteUint8x32: + v.Op = OpAMD64VPERMI2B256 + return true + case OpConcatPermuteUint8x64: + v.Op = OpAMD64VPERMI2B512 + return true case OpConcatShiftBytesRightGroupedUint8x32: v.Op = OpAMD64VPALIGNR256 return true @@ -4476,162 +4566,6 @@ func rewriteValueAMD64(v *Value) bool { case OpPanicBounds: v.Op = OpAMD64LoweredPanicBoundsRR return true - case OpPermute2Float32x16: - v.Op = OpAMD64VPERMI2PS512 - return true - case OpPermute2Float32x4: - v.Op = OpAMD64VPERMI2PS128 - return true - case OpPermute2Float32x8: - v.Op = OpAMD64VPERMI2PS256 - return true - case OpPermute2Float64x2: - v.Op = OpAMD64VPERMI2PD128 - return true - case OpPermute2Float64x4: - v.Op = OpAMD64VPERMI2PD256 - return true - case OpPermute2Float64x8: - v.Op = OpAMD64VPERMI2PD512 - return true - case OpPermute2Int16x16: - v.Op = OpAMD64VPERMI2W256 - return true - case OpPermute2Int16x32: - v.Op = OpAMD64VPERMI2W512 - return true - case OpPermute2Int16x8: - v.Op = OpAMD64VPERMI2W128 - return true - case OpPermute2Int32x16: - v.Op = OpAMD64VPERMI2D512 - return true - case OpPermute2Int32x4: - v.Op = OpAMD64VPERMI2D128 - return true - case OpPermute2Int32x8: - v.Op = OpAMD64VPERMI2D256 - return true - case OpPermute2Int64x2: - v.Op = OpAMD64VPERMI2Q128 - return true - case OpPermute2Int64x4: - v.Op = OpAMD64VPERMI2Q256 - return true - case OpPermute2Int64x8: - v.Op = OpAMD64VPERMI2Q512 - return true - case OpPermute2Int8x16: - v.Op = OpAMD64VPERMI2B128 - return true - case OpPermute2Int8x32: - v.Op = OpAMD64VPERMI2B256 - return true - case OpPermute2Int8x64: - v.Op = OpAMD64VPERMI2B512 - return true - case OpPermute2Uint16x16: - v.Op = OpAMD64VPERMI2W256 - return true - case OpPermute2Uint16x32: - v.Op = OpAMD64VPERMI2W512 - return true - case OpPermute2Uint16x8: - v.Op = OpAMD64VPERMI2W128 - return true - case OpPermute2Uint32x16: - v.Op = OpAMD64VPERMI2D512 - return true - case OpPermute2Uint32x4: - v.Op = OpAMD64VPERMI2D128 - return true - case OpPermute2Uint32x8: - v.Op = OpAMD64VPERMI2D256 - return true - case OpPermute2Uint64x2: - v.Op = OpAMD64VPERMI2Q128 - return true - case OpPermute2Uint64x4: - v.Op = OpAMD64VPERMI2Q256 - return true - case OpPermute2Uint64x8: - v.Op = 
OpAMD64VPERMI2Q512 - return true - case OpPermute2Uint8x16: - v.Op = OpAMD64VPERMI2B128 - return true - case OpPermute2Uint8x32: - v.Op = OpAMD64VPERMI2B256 - return true - case OpPermute2Uint8x64: - v.Op = OpAMD64VPERMI2B512 - return true - case OpPermuteConstantGroupedInt32x16: - v.Op = OpAMD64VPSHUFD512 - return true - case OpPermuteConstantGroupedInt32x8: - v.Op = OpAMD64VPSHUFD256 - return true - case OpPermuteConstantGroupedUint32x16: - v.Op = OpAMD64VPSHUFD512 - return true - case OpPermuteConstantGroupedUint32x8: - v.Op = OpAMD64VPSHUFD256 - return true - case OpPermuteConstantHiGroupedInt16x16: - v.Op = OpAMD64VPSHUFHW256 - return true - case OpPermuteConstantHiGroupedInt16x32: - v.Op = OpAMD64VPSHUFHW512 - return true - case OpPermuteConstantHiGroupedUint16x16: - v.Op = OpAMD64VPSHUFHW256 - return true - case OpPermuteConstantHiGroupedUint16x32: - v.Op = OpAMD64VPSHUFHW512 - return true - case OpPermuteConstantHiInt16x8: - v.Op = OpAMD64VPSHUFHW128 - return true - case OpPermuteConstantHiInt32x4: - v.Op = OpAMD64VPSHUFHW128 - return true - case OpPermuteConstantHiUint16x8: - v.Op = OpAMD64VPSHUFHW128 - return true - case OpPermuteConstantHiUint32x4: - v.Op = OpAMD64VPSHUFHW128 - return true - case OpPermuteConstantInt32x4: - v.Op = OpAMD64VPSHUFD128 - return true - case OpPermuteConstantLoGroupedInt16x16: - v.Op = OpAMD64VPSHUFHW256 - return true - case OpPermuteConstantLoGroupedInt16x32: - v.Op = OpAMD64VPSHUFHW512 - return true - case OpPermuteConstantLoGroupedUint16x16: - v.Op = OpAMD64VPSHUFHW256 - return true - case OpPermuteConstantLoGroupedUint16x32: - v.Op = OpAMD64VPSHUFHW512 - return true - case OpPermuteConstantLoInt16x8: - v.Op = OpAMD64VPSHUFHW128 - return true - case OpPermuteConstantLoInt32x4: - v.Op = OpAMD64VPSHUFHW128 - return true - case OpPermuteConstantLoUint16x8: - v.Op = OpAMD64VPSHUFHW128 - return true - case OpPermuteConstantLoUint32x4: - v.Op = OpAMD64VPSHUFHW128 - return true - case OpPermuteConstantUint32x4: - v.Op = OpAMD64VPSHUFD128 - return true case OpPermuteFloat32x16: v.Op = OpAMD64VPERMPS512 return true @@ -4644,18 +4578,6 @@ func rewriteValueAMD64(v *Value) bool { case OpPermuteFloat64x8: v.Op = OpAMD64VPERMPD512 return true - case OpPermuteGroupedInt8x32: - v.Op = OpAMD64VPSHUFB256 - return true - case OpPermuteGroupedInt8x64: - v.Op = OpAMD64VPSHUFB512 - return true - case OpPermuteGroupedUint8x32: - v.Op = OpAMD64VPSHUFB256 - return true - case OpPermuteGroupedUint8x64: - v.Op = OpAMD64VPSHUFB512 - return true case OpPermuteInt16x16: v.Op = OpAMD64VPERMW256 return true @@ -4678,7 +4600,7 @@ func rewriteValueAMD64(v *Value) bool { v.Op = OpAMD64VPERMQ512 return true case OpPermuteInt8x16: - v.Op = OpAMD64VPSHUFB128 + v.Op = OpAMD64VPERMB128 return true case OpPermuteInt8x32: v.Op = OpAMD64VPERMB256 @@ -4686,6 +4608,24 @@ func rewriteValueAMD64(v *Value) bool { case OpPermuteInt8x64: v.Op = OpAMD64VPERMB512 return true + case OpPermuteOrZeroGroupedInt8x32: + v.Op = OpAMD64VPSHUFB256 + return true + case OpPermuteOrZeroGroupedInt8x64: + v.Op = OpAMD64VPSHUFB512 + return true + case OpPermuteOrZeroGroupedUint8x32: + v.Op = OpAMD64VPSHUFB256 + return true + case OpPermuteOrZeroGroupedUint8x64: + v.Op = OpAMD64VPSHUFB512 + return true + case OpPermuteOrZeroInt8x16: + v.Op = OpAMD64VPSHUFB128 + return true + case OpPermuteOrZeroUint8x16: + v.Op = OpAMD64VPSHUFB128 + return true case OpPermuteUint16x16: v.Op = OpAMD64VPERMW256 return true @@ -4708,7 +4648,7 @@ func rewriteValueAMD64(v *Value) bool { v.Op = OpAMD64VPERMQ512 return true case 
OpPermuteUint8x16: - v.Op = OpAMD64VPSHUFB128 + v.Op = OpAMD64VPERMB128 return true case OpPermuteUint8x32: v.Op = OpAMD64VPERMB256 @@ -6124,6 +6064,60 @@ func rewriteValueAMD64(v *Value) bool { case OpconcatSelectedConstantUint64x2: v.Op = OpAMD64VSHUFPD128 return true + case OppermuteScalarsGroupedInt32x16: + v.Op = OpAMD64VPSHUFD512 + return true + case OppermuteScalarsGroupedInt32x8: + v.Op = OpAMD64VPSHUFD256 + return true + case OppermuteScalarsGroupedUint32x16: + v.Op = OpAMD64VPSHUFD512 + return true + case OppermuteScalarsGroupedUint32x8: + v.Op = OpAMD64VPSHUFD256 + return true + case OppermuteScalarsHiGroupedInt16x16: + v.Op = OpAMD64VPSHUFHW256 + return true + case OppermuteScalarsHiGroupedInt16x32: + v.Op = OpAMD64VPSHUFHW512 + return true + case OppermuteScalarsHiGroupedUint16x16: + v.Op = OpAMD64VPSHUFHW256 + return true + case OppermuteScalarsHiGroupedUint16x32: + v.Op = OpAMD64VPSHUFHW512 + return true + case OppermuteScalarsHiInt16x8: + v.Op = OpAMD64VPSHUFHW128 + return true + case OppermuteScalarsHiUint16x8: + v.Op = OpAMD64VPSHUFHW128 + return true + case OppermuteScalarsInt32x4: + v.Op = OpAMD64VPSHUFD128 + return true + case OppermuteScalarsLoGroupedInt16x16: + v.Op = OpAMD64VPSHUFLW256 + return true + case OppermuteScalarsLoGroupedInt16x32: + v.Op = OpAMD64VPSHUFLW512 + return true + case OppermuteScalarsLoGroupedUint16x16: + v.Op = OpAMD64VPSHUFLW256 + return true + case OppermuteScalarsLoGroupedUint16x32: + v.Op = OpAMD64VPSHUFLW512 + return true + case OppermuteScalarsLoInt16x8: + v.Op = OpAMD64VPSHUFLW128 + return true + case OppermuteScalarsLoUint16x8: + v.Op = OpAMD64VPSHUFLW128 + return true + case OppermuteScalarsUint32x4: + v.Op = OpAMD64VPSHUFD128 + return true case OpternInt32x16: v.Op = OpAMD64VPTERNLOGD512 return true @@ -31247,6 +31241,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked128(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) + // result: (VPERMI2WMasked128 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2W128 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2WMasked128) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU16Masked128 (VPMOVWB128_128 x) mask) // result: (VPMOVWBMasked128_128 x mask) for { @@ -31460,34 +31468,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked128(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) - // result: (VPERMI2WMasked128 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2W128 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2WMasked128) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) - // result: (VPSHUFHWMasked128 [a] x mask) - for { - if v_0.Op != OpAMD64VPSHUFHW128 { - break - } - a := auxIntToUint8(v_0.AuxInt) - x := v_0.Args[0] - mask := v_1 - v.reset(OpAMD64VPSHUFHWMasked128) - v.AuxInt = uint8ToAuxInt(a) - v.AddArg2(x, mask) - return true - } // match: (VMOVDQU16Masked128 (VPERMW128 x y) mask) // result: (VPERMWMasked128 x y mask) for { @@ -31676,6 +31656,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked128(v *Value) bool { v.AddArg3(x, y, mask) return true } + // match: (VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) + // result: (VPSHUFHWMasked128 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFHW128 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + 
v.reset(OpAMD64VPSHUFHWMasked128) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU16Masked128 (VPSHUFLW128 [a] x) mask) + // result: (VPSHUFLWMasked128 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFLW128 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFLWMasked128) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU16Masked128 (VPSLLW128const [a] x) mask) // result: (VPSLLWMasked128const [a] x mask) for { @@ -31785,6 +31793,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked256(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) + // result: (VPERMI2WMasked256 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2W256 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2WMasked256) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU16Masked256 (VPMOVWB128_256 x) mask) // result: (VPMOVWBMasked128_256 x mask) for { @@ -32034,34 +32056,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked256(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) - // result: (VPERMI2WMasked256 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2W256 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2WMasked256) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) - // result: (VPSHUFHWMasked256 [a] x mask) - for { - if v_0.Op != OpAMD64VPSHUFHW256 { - break - } - a := auxIntToUint8(v_0.AuxInt) - x := v_0.Args[0] - mask := v_1 - v.reset(OpAMD64VPSHUFHWMasked256) - v.AuxInt = uint8ToAuxInt(a) - v.AddArg2(x, mask) - return true - } // match: (VMOVDQU16Masked256 (VPERMW256 x y) mask) // result: (VPERMWMasked256 x y mask) for { @@ -32250,6 +32244,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked256(v *Value) bool { v.AddArg3(x, y, mask) return true } + // match: (VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) + // result: (VPSHUFHWMasked256 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFHW256 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFHWMasked256) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU16Masked256 (VPSHUFLW256 [a] x) mask) + // result: (VPSHUFLWMasked256 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFLW256 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFLWMasked256) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU16Masked256 (VPSLLW256const [a] x) mask) // result: (VPSLLWMasked256const [a] x mask) for { @@ -32359,6 +32381,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked512(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) + // result: (VPERMI2WMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2W512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2WMasked512) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU16Masked512 (VPMOVSXWD512 x) mask) // result: (VPMOVSXWDMasked512 x mask) for { @@ -32536,34 +32572,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked512(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) - // 
result: (VPERMI2WMasked512 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2W512 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2WMasked512) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) - // result: (VPSHUFHWMasked512 [a] x mask) - for { - if v_0.Op != OpAMD64VPSHUFHW512 { - break - } - a := auxIntToUint8(v_0.AuxInt) - x := v_0.Args[0] - mask := v_1 - v.reset(OpAMD64VPSHUFHWMasked512) - v.AuxInt = uint8ToAuxInt(a) - v.AddArg2(x, mask) - return true - } // match: (VMOVDQU16Masked512 (VPERMW512 x y) mask) // result: (VPERMWMasked512 x y mask) for { @@ -32752,6 +32760,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked512(v *Value) bool { v.AddArg3(x, y, mask) return true } + // match: (VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) + // result: (VPSHUFHWMasked512 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFHW512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFHWMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSHUFLW512 [a] x) mask) + // result: (VPSHUFLWMasked512 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFLW512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFLWMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU16Masked512 (VPSLLW512const [a] x) mask) // result: (VPSLLWMasked512const [a] x mask) for { @@ -32875,6 +32911,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked128(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) + // result: (VPERMI2PSMasked128 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2PS128 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2PSMasked128) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) + // result: (VPERMI2DMasked128 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2D128 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2DMasked128) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU32Masked128 (VPMOVDB128_128 x) mask) // result: (VPMOVDBMasked128_128 x mask) for { @@ -33232,48 +33296,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked128(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) - // result: (VPERMI2PSMasked128 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2PS128 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2PSMasked128) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) - // result: (VPERMI2DMasked128 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2D128 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2DMasked128) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) - // result: (VPSHUFDMasked128 [a] x mask) - for { - if v_0.Op != OpAMD64VPSHUFD128 { - break - } - a := auxIntToUint8(v_0.AuxInt) - x := v_0.Args[0] - mask := v_1 - v.reset(OpAMD64VPSHUFDMasked128) - v.AuxInt = uint8ToAuxInt(a) - v.AddArg2(x, mask) - return true - } // match: (VMOVDQU32Masked128 (VPROLD128 [a] x) mask) // result: 
(VPROLDMasked128 [a] x mask) for { @@ -33515,6 +33537,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked128(v *Value) bool { v.AddArg3(x, y, mask) return true } + // match: (VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) + // result: (VPSHUFDMasked128 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFD128 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFDMasked128) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU32Masked128 (VPSLLD128const [a] x) mask) // result: (VPSLLDMasked128const [a] x mask) for { @@ -33638,6 +33674,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked256(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) + // result: (VPERMI2PSMasked256 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2PS256 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2PSMasked256) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) + // result: (VPERMI2DMasked256 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2D256 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2DMasked256) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU32Masked256 (VPMOVDB128_256 x) mask) // result: (VPMOVDBMasked128_256 x mask) for { @@ -34031,48 +34095,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked256(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) - // result: (VPERMI2PSMasked256 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2PS256 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2PSMasked256) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) - // result: (VPERMI2DMasked256 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2D256 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2DMasked256) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) - // result: (VPSHUFDMasked256 [a] x mask) - for { - if v_0.Op != OpAMD64VPSHUFD256 { - break - } - a := auxIntToUint8(v_0.AuxInt) - x := v_0.Args[0] - mask := v_1 - v.reset(OpAMD64VPSHUFDMasked256) - v.AuxInt = uint8ToAuxInt(a) - v.AddArg2(x, mask) - return true - } // match: (VMOVDQU32Masked256 (VPERMPS256 x y) mask) // result: (VPERMPSMasked256 x y mask) for { @@ -34340,6 +34362,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked256(v *Value) bool { v.AddArg3(x, y, mask) return true } + // match: (VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) + // result: (VPSHUFDMasked256 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFD256 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFDMasked256) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU32Masked256 (VPSLLD256const [a] x) mask) // result: (VPSLLDMasked256const [a] x mask) for { @@ -34489,6 +34525,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) + // result: (VPERMI2PSMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2PS512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + 
v.reset(OpAMD64VPERMI2PSMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) + // result: (VPERMI2DMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2D512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2DMasked512) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU32Masked512 (VPMOVDB128_512 x) mask) // result: (VPMOVDBMasked128_512 x mask) for { @@ -34823,48 +34887,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool { v.AddArg3(x, y, mask) return true } - // match: (VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) - // result: (VPERMI2PSMasked512 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2PS512 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2PSMasked512) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) - // result: (VPERMI2DMasked512 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2D512 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2DMasked512) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) - // result: (VPSHUFDMasked512 [a] x mask) - for { - if v_0.Op != OpAMD64VPSHUFD512 { - break - } - a := auxIntToUint8(v_0.AuxInt) - x := v_0.Args[0] - mask := v_1 - v.reset(OpAMD64VPSHUFDMasked512) - v.AuxInt = uint8ToAuxInt(a) - v.AddArg2(x, mask) - return true - } // match: (VMOVDQU32Masked512 (VPERMPS512 x y) mask) // result: (VPERMPSMasked512 x y mask) for { @@ -35169,6 +35191,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool { v.AddArg3(x, y, mask) return true } + // match: (VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) + // result: (VPSHUFDMasked512 [a] x mask) + for { + if v_0.Op != OpAMD64VPSHUFD512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHUFDMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU32Masked512 (VPSLLD512const [a] x) mask) // result: (VPSLLDMasked512const [a] x mask) for { @@ -35280,6 +35316,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked128(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) + // result: (VPERMI2PDMasked128 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2PD128 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2PDMasked128) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) + // result: (VPERMI2QMasked128 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2Q128 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2QMasked128) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU64Masked128 (VPMOVQB128_128 x) mask) // result: (VPMOVQBMasked128_128 x mask) for { @@ -35571,34 +35635,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked128(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) - // result: (VPERMI2PDMasked128 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2PD128 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2PDMasked128) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU64Masked128 
(VPERMI2Q128 x y z) mask) - // result: (VPERMI2QMasked128 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2Q128 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2QMasked128) - v.AddArg4(x, y, z, mask) - return true - } // match: (VMOVDQU64Masked128 (VRCP14PD128 x) mask) // result: (VRCP14PDMasked128 x mask) for { @@ -35987,6 +36023,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked256(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) + // result: (VPERMI2PDMasked256 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2PD256 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2PDMasked256) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) + // result: (VPERMI2QMasked256 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2Q256 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2QMasked256) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU64Masked256 (VPMOVQB128_256 x) mask) // result: (VPMOVQBMasked128_256 x mask) for { @@ -36314,34 +36378,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked256(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) - // result: (VPERMI2PDMasked256 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2PD256 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2PDMasked256) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) - // result: (VPERMI2QMasked256 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2Q256 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2QMasked256) - v.AddArg4(x, y, z, mask) - return true - } // match: (VMOVDQU64Masked256 (VPERMPD256 x y) mask) // result: (VPERMPDMasked256 x y mask) for { @@ -36782,6 +36818,34 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked512(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) + // result: (VPERMI2PDMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2PD512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2PDMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) + // result: (VPERMI2QMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2Q512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2QMasked512) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU64Masked512 (VPMOVQB128_512 x) mask) // result: (VPMOVQBMasked128_512 x mask) for { @@ -37050,34 +37114,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked512(v *Value) bool { v.AddArg3(x, y, mask) return true } - // match: (VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) - // result: (VPERMI2PDMasked512 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2PD512 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2PDMasked512) - v.AddArg4(x, y, z, mask) - return true - } - // match: (VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) - // result: (VPERMI2QMasked512 x y z mask) - for { - if v_0.Op != OpAMD64VPERMI2Q512 { - break - } - z := v_0.Args[2] - x := 
v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPERMI2QMasked512) - v.AddArg4(x, y, z, mask) - return true - } // match: (VMOVDQU64Masked512 (VPERMPD512 x y) mask) // result: (VPERMPDMasked512 x y mask) for { @@ -37491,6 +37527,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked128(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) + // result: (VPERMI2BMasked128 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2B128 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2BMasked128) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask) // result: (VPALIGNRMasked128 [a] x y mask) for { @@ -37685,18 +37735,17 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked128(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) - // result: (VPERMI2BMasked128 x y z mask) + // match: (VMOVDQU8Masked128 (VPERMB128 x y) mask) + // result: (VPERMBMasked128 x y mask) for { - if v_0.Op != OpAMD64VPERMI2B128 { + if v_0.Op != OpAMD64VPERMB128 { break } - z := v_0.Args[2] - x := v_0.Args[0] y := v_0.Args[1] + x := v_0.Args[0] mask := v_1 - v.reset(OpAMD64VPERMI2BMasked128) - v.AddArg4(x, y, z, mask) + v.reset(OpAMD64VPERMBMasked128) + v.AddArg3(x, y, mask) return true } // match: (VMOVDQU8Masked128 (VPSHUFB128 x y) mask) @@ -37832,6 +37881,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked256(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) + // result: (VPERMI2BMasked256 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2B256 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2BMasked256) + v.AddArg4(x, y, z, mask) + return true + } // match: (VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask) // result: (VPALIGNRMasked256 [a] x y mask) for { @@ -38026,18 +38089,17 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked256(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) - // result: (VPERMI2BMasked256 x y z mask) + // match: (VMOVDQU8Masked256 (VPERMB256 x y) mask) + // result: (VPERMBMasked256 x y mask) for { - if v_0.Op != OpAMD64VPERMI2B256 { + if v_0.Op != OpAMD64VPERMB256 { break } - z := v_0.Args[2] - x := v_0.Args[0] y := v_0.Args[1] + x := v_0.Args[0] mask := v_1 - v.reset(OpAMD64VPERMI2BMasked256) - v.AddArg4(x, y, z, mask) + v.reset(OpAMD64VPERMBMasked256) + v.AddArg3(x, y, mask) return true } // match: (VMOVDQU8Masked256 (VPSHUFB256 x y) mask) @@ -38053,19 +38115,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked256(v *Value) bool { v.AddArg3(x, y, mask) return true } - // match: (VMOVDQU8Masked256 (VPERMB256 x y) mask) - // result: (VPERMBMasked256 x y mask) - for { - if v_0.Op != OpAMD64VPERMB256 { - break - } - y := v_0.Args[1] - x := v_0.Args[0] - mask := v_1 - v.reset(OpAMD64VPERMBMasked256) - v.AddArg3(x, y, mask) - return true - } // match: (VMOVDQU8Masked256 (VPSUBB256 x y) mask) // result: (VPSUBBMasked256 x y mask) for { @@ -38186,6 +38235,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) + // result: (VPERMI2BMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2B512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2BMasked512) + v.AddArg4(x, y, 
z, mask) + return true + } // match: (VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask) // result: (VPALIGNRMasked512 [a] x y mask) for { @@ -38380,18 +38443,17 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v *Value) bool { v.AddArg2(x, mask) return true } - // match: (VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) - // result: (VPERMI2BMasked512 x y z mask) + // match: (VMOVDQU8Masked512 (VPERMB512 x y) mask) + // result: (VPERMBMasked512 x y mask) for { - if v_0.Op != OpAMD64VPERMI2B512 { + if v_0.Op != OpAMD64VPERMB512 { break } - z := v_0.Args[2] - x := v_0.Args[0] y := v_0.Args[1] + x := v_0.Args[0] mask := v_1 - v.reset(OpAMD64VPERMI2BMasked512) - v.AddArg4(x, y, z, mask) + v.reset(OpAMD64VPERMBMasked512) + v.AddArg3(x, y, mask) return true } // match: (VMOVDQU8Masked512 (VPSHUFB512 x y) mask) @@ -38407,19 +38469,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v *Value) bool { v.AddArg3(x, y, mask) return true } - // match: (VMOVDQU8Masked512 (VPERMB512 x y) mask) - // result: (VPERMBMasked512 x y mask) - for { - if v_0.Op != OpAMD64VPERMB512 { - break - } - y := v_0.Args[1] - x := v_0.Args[0] - mask := v_1 - v.reset(OpAMD64VPERMBMasked512) - v.AddArg3(x, y, mask) - return true - } // match: (VMOVDQU8Masked512 (VPSUBB512 x y) mask) // result: (VPSUBBMasked512 x y mask) for { @@ -42642,6 +42691,21 @@ func rewriteValueAMD64_OpAMD64VPBLENDMWMasked512(v *Value) bool { v.AddArg3(dst, x, mask) return true } + // match: (VPBLENDMWMasked512 dst (VPSHUFLW512 [a] x) mask) + // result: (VPSHUFLWMasked512Merging dst [a] x mask) + for { + dst := v_0 + if v_1.Op != OpAMD64VPSHUFLW512 { + break + } + a := auxIntToUint8(v_1.AuxInt) + x := v_1.Args[0] + mask := v_2 + v.reset(OpAMD64VPSHUFLWMasked512Merging) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg3(dst, x, mask) + return true + } // match: (VPBLENDMWMasked512 dst (VPSLLVW512 x y) mask) // result: (VPSLLVWMasked512Merging dst x y mask) for { @@ -45526,6 +45590,27 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB128(v *Value) bool { v.AddArg3(dst, x, v0) return true } + // match: (VPBLENDVB128 dst (VPSHUFLW128 [a] x) mask) + // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) + // result: (VPSHUFLWMasked128Merging dst [a] x (VPMOVVec16x8ToM mask)) + for { + dst := v_0 + if v_1.Op != OpAMD64VPSHUFLW128 { + break + } + a := auxIntToUint8(v_1.AuxInt) + x := v_1.Args[0] + mask := v_2 + if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) { + break + } + v.reset(OpAMD64VPSHUFLWMasked128Merging) + v.AuxInt = uint8ToAuxInt(a) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(dst, x, v0) + return true + } // match: (VPBLENDVB128 dst (VPSLLD128const [a] x) mask) // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) // result: (VPSLLDMasked128constMerging dst [a] x (VPMOVVec32x4ToM mask)) @@ -48223,6 +48308,27 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB256(v *Value) bool { v.AddArg3(dst, x, v0) return true } + // match: (VPBLENDVB256 dst (VPSHUFLW256 [a] x) mask) + // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) + // result: (VPSHUFLWMasked256Merging dst [a] x (VPMOVVec16x16ToM mask)) + for { + dst := v_0 + if v_1.Op != OpAMD64VPSHUFLW256 { + break + } + a := auxIntToUint8(v_1.AuxInt) + x := v_1.Args[0] + mask := v_2 + if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) { + break + } + v.reset(OpAMD64VPSHUFLWMasked256Merging) + v.AuxInt = uint8ToAuxInt(a) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(dst, x, v0) + return true + } // match: (VPBLENDVB256 dst (VPSLLD256const [a] x) 
mask) // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) // result: (VPSLLDMasked256constMerging dst [a] x (VPMOVVec32x8ToM mask)) diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 818b3544ae..34e491371e 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -228,6 +228,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint64x2.Compress", opLen2(ssa.OpCompressUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.Compress", opLen2(ssa.OpCompressUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Compress", opLen2(ssa.OpCompressUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x64.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x2, 
types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64) @@ -802,8 +832,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.Or", opLen2(ssa.OpOrUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Or", opLen2(ssa.OpOrUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.Permute", opLen2(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x16.Permute", opLen2(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x32.Permute", opLen2_21(ssa.OpPermuteUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.Permute", opLen2_21(ssa.OpPermuteInt8x64, types.TypeVec512), sys.AMD64) @@ -826,62 +856,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Float64x8.Permute", opLen2_21(ssa.OpPermuteFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int64x8.Permute", opLen2_21(ssa.OpPermuteInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint64x8.Permute", opLen2_21(ssa.OpPermuteUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.Permute2", opLen3_231(ssa.OpPermute2Int8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x16.Permute2", opLen3_231(ssa.OpPermute2Uint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.Permute2", opLen3_231(ssa.OpPermute2Int8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x32.Permute2", opLen3_231(ssa.OpPermute2Uint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.Permute2", opLen3_231(ssa.OpPermute2Int8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x64.Permute2", opLen3_231(ssa.OpPermute2Uint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.Permute2", opLen3_231(ssa.OpPermute2Int16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x8.Permute2", opLen3_231(ssa.OpPermute2Uint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.Permute2", opLen3_231(ssa.OpPermute2Int16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x16.Permute2", opLen3_231(ssa.OpPermute2Uint16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.Permute2", opLen3_231(ssa.OpPermute2Int16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x32.Permute2", opLen3_231(ssa.OpPermute2Uint16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.Permute2", opLen3_231(ssa.OpPermute2Float32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x4.Permute2", opLen3_231(ssa.OpPermute2Int32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x4.Permute2", opLen3_231(ssa.OpPermute2Uint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.Permute2", opLen3_231(ssa.OpPermute2Float32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x8.Permute2", opLen3_231(ssa.OpPermute2Int32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint32x8.Permute2", opLen3_231(ssa.OpPermute2Uint32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.Permute2", opLen3_231(ssa.OpPermute2Float32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x16.Permute2", opLen3_231(ssa.OpPermute2Int32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint32x16.Permute2", opLen3_231(ssa.OpPermute2Uint32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.Permute2", opLen3_231(ssa.OpPermute2Float64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x2.Permute2", opLen3_231(ssa.OpPermute2Int64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint64x2.Permute2", opLen3_231(ssa.OpPermute2Uint64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.Permute2", opLen3_231(ssa.OpPermute2Float64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int64x4.Permute2", opLen3_231(ssa.OpPermute2Int64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint64x4.Permute2", opLen3_231(ssa.OpPermute2Uint64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.Permute2", opLen3_231(ssa.OpPermute2Float64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int64x8.Permute2", opLen3_231(ssa.OpPermute2Int64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint64x8.Permute2", opLen3_231(ssa.OpPermute2Uint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.PermuteConstant", 
opLen1Imm8(ssa.OpPermuteConstantInt32x4, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Uint32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantUint32x4, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Int32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x8, types.TypeVec256, 0), sys.AMD64) - addF(simdPackage, "Int32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x16, types.TypeVec512, 0), sys.AMD64) - addF(simdPackage, "Uint32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x8, types.TypeVec256, 0), sys.AMD64) - addF(simdPackage, "Uint32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64) - addF(simdPackage, "Int16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt16x8, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Int32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt32x4, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Uint16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint16x8, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Uint32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint32x4, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Int16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64) - addF(simdPackage, "Int16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64) - addF(simdPackage, "Uint16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64) - addF(simdPackage, "Uint16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64) - addF(simdPackage, "Int16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt16x8, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Int32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt32x4, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Uint16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint16x8, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Uint32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint32x4, types.TypeVec128, 0), sys.AMD64) - addF(simdPackage, "Int16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64) - addF(simdPackage, "Int16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64) - addF(simdPackage, "Uint16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64) - addF(simdPackage, "Uint16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64) - addF(simdPackage, "Int8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.PermuteOrZero", opLen2(ssa.OpPermuteOrZeroInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x16.PermuteOrZero", opLen2(ssa.OpPermuteOrZeroUint8x16, types.TypeVec128), 
sys.AMD64) + addF(simdPackage, "Int8x32.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x32.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.Reciprocal", opLen1(ssa.OpReciprocalFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Reciprocal", opLen1(ssa.OpReciprocalFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.Reciprocal", opLen1(ssa.OpReciprocalFloat32x16, types.TypeVec512), sys.AMD64) @@ -1300,6 +1280,24 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint32x16.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64) addF(simdPackage, "Uint64x4.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Uint64x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Int32x4.permuteScalars", opLen1Imm8(ssa.OppermuteScalarsInt32x4, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Uint32x4.permuteScalars", opLen1Imm8(ssa.OppermuteScalarsUint32x4, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Int32x8.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedInt32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int32x16.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedInt32x16, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Uint32x8.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedUint32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint32x16.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedUint32x16, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Int16x8.permuteScalarsHi", opLen1Imm8(ssa.OppermuteScalarsHiInt16x8, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Uint16x8.permuteScalarsHi", opLen1Imm8(ssa.OppermuteScalarsHiUint16x8, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Int16x16.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int16x32.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Uint16x16.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint16x32.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Int16x8.permuteScalarsLo", opLen1Imm8(ssa.OppermuteScalarsLoInt16x8, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Uint16x8.permuteScalarsLo", opLen1Imm8(ssa.OppermuteScalarsLoUint16x8, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Int16x16.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int16x32.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Uint16x16.permuteScalarsLoGrouped", 
opLen1Imm8(ssa.OppermuteScalarsLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint16x32.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64) addF(simdPackage, "Int32x4.tern", opLen3Imm8(ssa.OpternInt32x4, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Int32x8.tern", opLen3Imm8(ssa.OpternInt32x8, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Int32x16.tern", opLen3Imm8(ssa.OpternInt32x16, types.TypeVec512, 0), sys.AMD64) diff --git a/src/simd/_gen/simdgen/gen_simdGenericOps.go b/src/simd/_gen/simdgen/gen_simdGenericOps.go index 3dbbeb09f7..bcbc18b3b2 100644 --- a/src/simd/_gen/simdgen/gen_simdGenericOps.go +++ b/src/simd/_gen/simdgen/gen_simdGenericOps.go @@ -46,6 +46,9 @@ func writeSIMDGenericOps(ops []Operation) *bytes.Buffer { if op.NoGenericOps != nil && *op.NoGenericOps == "true" { continue } + if op.SkipMaskedMethod() { + continue + } _, _, _, immType, gOp := op.shape() gOpData := genericOpsData{gOp.GenericName(), len(gOp.In), op.Commutative} if immType == VarImm || immType == ConstVarImm { diff --git a/src/simd/_gen/simdgen/gen_simdIntrinsics.go b/src/simd/_gen/simdgen/gen_simdIntrinsics.go index b963fb9abb..04344dc831 100644 --- a/src/simd/_gen/simdgen/gen_simdIntrinsics.go +++ b/src/simd/_gen/simdgen/gen_simdIntrinsics.go @@ -107,6 +107,9 @@ func writeSIMDIntrinsics(ops []Operation, typeMap simdTypeMap) *bytes.Buffer { if op.NoTypes != nil && *op.NoTypes == "true" { continue } + if op.SkipMaskedMethod() { + continue + } if s, op, err := classifyOp(op); err == nil { if err := t.ExecuteTemplate(buffer, s, op); err != nil { panic(fmt.Errorf("failed to execute template %s for op %s: %w", s, op.Go, err)) diff --git a/src/simd/_gen/simdgen/gen_simdTypes.go b/src/simd/_gen/simdgen/gen_simdTypes.go index 23b363d38a..dc5f77adaa 100644 --- a/src/simd/_gen/simdgen/gen_simdTypes.go +++ b/src/simd/_gen/simdgen/gen_simdTypes.go @@ -604,6 +604,9 @@ func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) (f, fI *bytes.Buffer) if op.NoTypes != nil && *op.NoTypes == "true" { continue } + if op.SkipMaskedMethod() { + continue + } idxVecAsScalar, err := checkVecAsScalar(op) if err != nil { panic(err) diff --git a/src/simd/_gen/simdgen/gen_simdrules.go b/src/simd/_gen/simdgen/gen_simdrules.go index 19393add71..5693496c92 100644 --- a/src/simd/_gen/simdgen/gen_simdrules.go +++ b/src/simd/_gen/simdgen/gen_simdrules.go @@ -345,7 +345,8 @@ func writeSIMDRules(ops []Operation) *bytes.Buffer { data.ArgsOut = "..." } data.tplName = tplName - if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" { + if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" || + opr.SkipMaskedMethod() { optData = append(optData, data) continue } diff --git a/src/simd/_gen/simdgen/godefs.go b/src/simd/_gen/simdgen/godefs.go index 7d3943b4b8..0b8fbd7e3d 100644 --- a/src/simd/_gen/simdgen/godefs.go +++ b/src/simd/_gen/simdgen/godefs.go @@ -73,6 +73,29 @@ type rawOperation struct { NoGenericOps *string // If non-nil, this string will be attached to the machine ssa op name. E.g. "const" SSAVariant *string + // If true, do not emit method declarations, generic ops, or intrinsics for masked variants + // DO emit the architecture-specific opcodes and optimizations. 
+ HideMaskMethods *bool +} + +func (o *Operation) IsMasked() bool { + if len(o.InVariant) == 0 { + return false + } + if len(o.InVariant) == 1 && o.InVariant[0].Class == "mask" { + return true + } + panic(fmt.Errorf("unknown inVariant")) +} + +func (o *Operation) SkipMaskedMethod() bool { + if o.HideMaskMethods == nil { + return false + } + if *o.HideMaskMethods && o.IsMasked() { + return true + } + return false } func (o *Operation) DecodeUnified(v *unify.Value) error { @@ -80,14 +103,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error { return err } - isMasked := false - if len(o.InVariant) == 0 { - // No variant - } else if len(o.InVariant) == 1 && o.InVariant[0].Class == "mask" { - isMasked = true - } else { - return fmt.Errorf("unknown inVariant") - } + isMasked := o.IsMasked() // Compute full Go method name. o.Go = o.rawOperation.Go @@ -104,6 +120,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error { o.Documentation = regexp.MustCompile(`\bNAME\b`).ReplaceAllString(o.Documentation, o.Go) if isMasked { o.Documentation += "\n//\n// This operation is applied selectively under a write mask." + // Suppress generic op and method declaration for exported methods, if a mask is present. if unicode.IsUpper([]rune(o.Go)[0]) { trueVal := "true" o.NoGenericOps = &trueVal diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml index bb47819f2f..44bd8efb7f 100644 --- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml @@ -27,18 +27,22 @@ constImm: 1 documentation: !string |- // NAME returns the upper half of x. +- go: PermuteOrZero + commutative: false + documentation: !string |- + // NAME performs a full permutation of vector x using indices: + // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} - go: Permute commutative: false documentation: !string |- // NAME performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} - // Only the needed bits to represent x's index are used in indices' elements. -- go: Permute2 # Permute2 is only available on or after AVX512 +- go: ConcatPermute # ConcatPermute is only available on or after AVX512 commutative: false documentation: !string |- // NAME performs a full permutation of vector x, y using indices: // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} - // where xy is x appending y. + // where xy is the concatenation of x (lower half) and y (upper half). // Only the needed bits to represent xy's index are used in indices' elements. - go: Compress commutative: false @@ -74,31 +78,35 @@ documentation: !string |- // NAME copies element zero of its (128-bit) input to all elements of // the 512-bit output vector. +- go: PermuteOrZeroGrouped + commutative: false + documentation: !string |- # Detailed documentation will rely on the specific ops. + // NAME performs a grouped permutation of vector x using indices: - go: PermuteGrouped commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. // NAME performs a grouped permutation of vector x using indices: -- go: PermuteConstant +- go: permuteScalars commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. 
// NAME performs a permutation of vector x using constant indices: -- go: PermuteConstantGrouped +- go: permuteScalarsGrouped commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. // NAME performs a grouped permutation of vector x using constant indices: -- go: PermuteConstantLo +- go: permuteScalarsLo commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. // NAME performs a permutation of vector x using constant indices: -- go: PermuteConstantLoGrouped +- go: permuteScalarsLoGrouped commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. // NAME performs a grouped permutation of vector x using constant indices: -- go: PermuteConstantHi +- go: permuteScalarsHi commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. // NAME performs a permutation of vector x using constant indices: -- go: PermuteConstantHiGrouped +- go: permuteScalarsHiGrouped commutative: false documentation: !string |- # Detailed documentation will rely on the specific ops. // NAME performs a grouped permutation of vector x using constant indices: @@ -218,8 +226,10 @@ - go: Select128FromPair commutative: false documentation: !string |- - // NAME selects the low and high 128-bit halves from the 128-bit halves - // of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. + // NAME treats the 256-bit vectors x and y as a single vector of four + // 128-bit elements, and returns a 256-bit result formed by + // concatenating the two elements specified by lo and hi. + // For example, {4,5}.NAME(3,0,{6,7}) returns {7,4}. - go: ConcatShiftBytesRight commutative: false diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml index 75fbc532b8..697d6a8bce 100644 --- a/src/simd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml @@ -213,19 +213,75 @@ - *f64xN - go: Permute - asm: "VPERM[BWDQ]|VPERMP[SD]" + asm: "VPERMQ|VPERMPD" + addDoc: !string |- + // The low 2 bits (values 0-3) of each element of indices is used operandOrder: "21Type1" in: - &anyindices go: $t name: indices overwriteBase: uint + - &any4 + go: $t + lanes: 4 + out: - &any go: $t + +- go: Permute + asm: "VPERM[WDQ]|VPERMP[SD]" + addDoc: !string |- + // The low 3 bits (values 0-7) of each element of indices is used + operandOrder: "21Type1" + in: + - *anyindices + - &any8 + go: $t + lanes: 8 + out: + - *any + +- go: Permute + asm: "VPERM[BWD]|VPERMPS" + addDoc: !string |- + // The low 4 bits (values 0-15) of each element of indices is used + operandOrder: "21Type1" + in: + - *anyindices + - &any16 + go: $t + lanes: 16 out: - *any -- go: Permute2 +- go: Permute + asm: "VPERM[BW]" + addDoc: !string |- + // The low 5 bits (values 0-31) of each element of indices is used + operandOrder: "21Type1" + in: + - *anyindices + - &any32 + go: $t + lanes: 32 + out: + - *any + +- go: Permute + asm: "VPERMB" + addDoc: !string |- + // The low 6 bits (values 0-63) of each element of indices is used + operandOrder: "21Type1" + in: + - *anyindices + - &any64 + go: $t + lanes: 64 + out: + - *any + +- go: ConcatPermute asm: "VPERMI2[BWDQ]|VPERMI2P[SD]" # Because we are overwriting the receiver's type, we # have to move the receiver to be a parameter so that @@ -403,113 +459,137 @@ base: $b # VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. 
(It's AVX) -- go: Permute +- go: PermuteOrZero asm: VPSHUFB addDoc: !string |- - // However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. + // The lower four bits of each byte-sized index in indices select an element from x, + // unless the index's sign bit is set in which case zero is used instead. in: - &128any bits: 128 go: $t - bits: 128 - go: $t name: indices + base: int # always signed out: - *128any -- go: PermuteGrouped + +- go: PermuteOrZeroGrouped asm: VPSHUFB addDoc: !string |- - // result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} - // Only the needed bits to represent the index of a group of x are used in indices' elements. - // However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. + // result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} + // The lower four bits of each byte-sized index in indices select an element from its corresponding group in x, + // unless the index's sign bit is set in which case zero is used instead. // Each group is of size 128-bit. in: - &256Or512any bits: "256|512" go: $t - bits: "256|512" - go: $t + base: int name: indices out: - *256Or512any -- go: PermuteConstant +- go: permuteScalars asm: VPSHUFD addDoc: !string |- - // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} - // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} + // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. in: - *128any - class: immediate immOffset: 0 name: indices + hideMaskMethods: true out: - *128any -- go: PermuteConstantGrouped + +- go: permuteScalarsGrouped asm: VPSHUFD addDoc: !string |- - // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} - // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} + // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // Each group is of size 128-bit. in: - *256Or512any - class: immediate immOffset: 0 name: indices + hideMaskMethods: true out: - *256Or512any -- go: PermuteConstantLo - asm: VPSHUFHW +- go: permuteScalarsLo + asm: VPSHUFLW addDoc: !string |- - // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} - // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} + // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. 
in: - - *128any + - &128lanes8 + bits: 128 + go: $t + elemBits: 16 - class: immediate immOffset: 0 name: indices + hideMaskMethods: true out: - - *128any -- go: PermuteConstantLoGrouped - asm: VPSHUFHW + - *128lanes8 + +- go: permuteScalarsLoGrouped + asm: VPSHUFLW addDoc: !string |- - // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} - // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // + // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7], + // x_group1[indices[0:2]], ...} + // + // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // Each group is of size 128-bit. in: - - *256Or512any + - &256Or512lanes8 + bits: "256|512" + go: $t + elemBits: 16 - class: immediate immOffset: 0 name: indices + hideMaskMethods: true out: - - *256Or512any + - *256Or512lanes8 -- go: PermuteConstantHi +- go: permuteScalarsHi asm: VPSHUFHW addDoc: !string |- - // result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} - // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} + // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. in: - - *128any + - *128lanes8 - class: immediate immOffset: 0 name: indices + hideMaskMethods: true out: - - *128any -- go: PermuteConstantHiGrouped + - *128lanes8 + +- go: permuteScalarsHiGrouped asm: VPSHUFHW addDoc: !string |- - // result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} - // Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. + // result = + // + // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], + // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} + // + // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. // Each group is of size 128-bit. 
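A note on the permuteScalars* entries above: the four 2-bit indices are packed low-to-high into the single immediate byte (the first index occupies bits 0:2), and the exported four-parameter methods (PermuteScalars, PermuteScalarsLo, PermuteScalarsHi, and their Grouped variants, exercised in the tests below) supply those four values. The following scalar sketch only restates the documented VPSHUFLW semantics under that assumption; permuteScalarsLoRef is a hypothetical reference helper, not part of this patch.

func permuteScalarsLoRef(x [8]int16, a, b, c, d uint8) [8]int16 {
	// Pack the four 2-bit indices low-to-high, as in the doc strings above:
	// a is indices[0:2], b is indices[2:4], and so on.
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | (d&3)<<6
	var r [8]int16
	for i := uint(0); i < 4; i++ {
		r[i] = x[(imm>>(2*i))&3] // result[i] = x[indices[2i:2i+2]]
	}
	copy(r[4:], x[4:]) // the upper four elements pass through unchanged
	return r
}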
in: - - *256Or512any + - *256Or512lanes8 - class: immediate immOffset: 0 name: indices + hideMaskMethods: true out: - - *256Or512any + - *256Or512lanes8 - go: InterleaveHi asm: VPUNPCKH(QDQ|DQ|WD|WB) diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go index 2d7793ef05..f51e3dc15f 100644 --- a/src/simd/internal/simd_test/simd_test.go +++ b/src/simd/internal/simd_test/simd_test.go @@ -163,7 +163,20 @@ func TestPermute(t *testing.T) { } } -func TestPermute2(t *testing.T) { +func TestPermuteOrZero(t *testing.T) { + x := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + indices := []int8{7, 6, 5, 4, 3, 2, 1, 0, -1, 8, -1, 9, -1, 10, -1, 11} + want := []uint8{8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 0, 10, 0, 11, 0, 12} + got := make([]uint8, len(x)) + simd.LoadUint8x16Slice(x).PermuteOrZero(simd.LoadInt8x16Slice(indices)).StoreSlice(got) + for i := range 8 { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestConcatPermute(t *testing.T) { if !simd.X86.AVX512() { t.Skip("Test requires X86.AVX512, not available on this hardware") return @@ -173,7 +186,7 @@ func TestPermute2(t *testing.T) { indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0} want := []int64{-8, 7, -6, 5, -4, 3, -2, 1} got := make([]int64, 8) - simd.LoadInt64x8Slice(x).Permute2(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got) + simd.LoadInt64x8Slice(x).ConcatPermute(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got) for i := range 8 { if want[i] != got[i] { t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) @@ -1161,3 +1174,75 @@ func TestDotProductQuadruple(t *testing.T) { } } } + +func TestPermuteScalars(t *testing.T) { + x := []int32{11, 12, 13, 14} + want := []int32{12, 13, 14, 11} + got := make([]int32, 4) + simd.LoadInt32x4Slice(x).PermuteScalars(1, 2, 3, 0).StoreSlice(got) + for i := range 4 { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestPermuteScalarsGrouped(t *testing.T) { + x := []int32{11, 12, 13, 14, 21, 22, 23, 24} + want := []int32{12, 13, 14, 11, 22, 23, 24, 21} + got := make([]int32, 8) + simd.LoadInt32x8Slice(x).PermuteScalarsGrouped(1, 2, 3, 0).StoreSlice(got) + for i := range 8 { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestPermuteScalarsHi(t *testing.T) { + x := []int16{-1, -2, -3, -4, 11, 12, 13, 14} + want := []int16{-1, -2, -3, -4, 12, 13, 14, 11} + got := make([]int16, len(x)) + simd.LoadInt16x8Slice(x).PermuteScalarsHi(1, 2, 3, 0).StoreSlice(got) + for i := range got { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestPermuteScalarsLo(t *testing.T) { + x := []int16{11, 12, 13, 14, 4, 5, 6, 7} + want := []int16{12, 13, 14, 11, 4, 5, 6, 7} + got := make([]int16, len(x)) + simd.LoadInt16x8Slice(x).PermuteScalarsLo(1, 2, 3, 0).StoreSlice(got) + for i := range got { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestPermuteScalarsHiGrouped(t *testing.T) { + x := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114} + want := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111} + got := make([]int16, len(x)) + 
simd.LoadInt16x16Slice(x).PermuteScalarsHiGrouped(1, 2, 3, 0).StoreSlice(got) + for i := range got { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} + +func TestPermuteScalarsLoGrouped(t *testing.T) { + x := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17} + want := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17} + got := make([]int16, len(x)) + simd.LoadInt16x16Slice(x).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(got) + for i := range got { + if want[i] != got[i] { + t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) + } + } +} diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index e06d1f652e..e9ddb463be 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -1272,6 +1272,248 @@ func (x Uint64x4) Compress(mask Mask64x4) Uint64x4 // Asm: VPCOMPRESSQ, CPU Feature: AVX512 func (x Uint64x8) Compress(mask Mask64x8) Uint64x8 +/* ConcatPermute */ + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2B, CPU Feature: AVX512VBMI +func (x Int8x16) ConcatPermute(y Int8x16, indices Uint8x16) Int8x16 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2B, CPU Feature: AVX512VBMI +func (x Uint8x16) ConcatPermute(y Uint8x16, indices Uint8x16) Uint8x16 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2B, CPU Feature: AVX512VBMI +func (x Int8x32) ConcatPermute(y Int8x32, indices Uint8x32) Int8x32 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2B, CPU Feature: AVX512VBMI +func (x Uint8x32) ConcatPermute(y Uint8x32, indices Uint8x32) Uint8x32 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2B, CPU Feature: AVX512VBMI +func (x Int8x64) ConcatPermute(y Int8x64, indices Uint8x64) Int8x64 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. 
+// +// Asm: VPERMI2B, CPU Feature: AVX512VBMI +func (x Uint8x64) ConcatPermute(y Uint8x64, indices Uint8x64) Uint8x64 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2W, CPU Feature: AVX512 +func (x Int16x8) ConcatPermute(y Int16x8, indices Uint16x8) Int16x8 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2W, CPU Feature: AVX512 +func (x Uint16x8) ConcatPermute(y Uint16x8, indices Uint16x8) Uint16x8 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2W, CPU Feature: AVX512 +func (x Int16x16) ConcatPermute(y Int16x16, indices Uint16x16) Int16x16 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2W, CPU Feature: AVX512 +func (x Uint16x16) ConcatPermute(y Uint16x16, indices Uint16x16) Uint16x16 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2W, CPU Feature: AVX512 +func (x Int16x32) ConcatPermute(y Int16x32, indices Uint16x32) Int16x32 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2W, CPU Feature: AVX512 +func (x Uint16x32) ConcatPermute(y Uint16x32, indices Uint16x32) Uint16x32 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2PS, CPU Feature: AVX512 +func (x Float32x4) ConcatPermute(y Float32x4, indices Uint32x4) Float32x4 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. 
+// +// Asm: VPERMI2D, CPU Feature: AVX512 +func (x Int32x4) ConcatPermute(y Int32x4, indices Uint32x4) Int32x4 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2D, CPU Feature: AVX512 +func (x Uint32x4) ConcatPermute(y Uint32x4, indices Uint32x4) Uint32x4 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2PS, CPU Feature: AVX512 +func (x Float32x8) ConcatPermute(y Float32x8, indices Uint32x8) Float32x8 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2D, CPU Feature: AVX512 +func (x Int32x8) ConcatPermute(y Int32x8, indices Uint32x8) Int32x8 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2D, CPU Feature: AVX512 +func (x Uint32x8) ConcatPermute(y Uint32x8, indices Uint32x8) Uint32x8 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2PS, CPU Feature: AVX512 +func (x Float32x16) ConcatPermute(y Float32x16, indices Uint32x16) Float32x16 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2D, CPU Feature: AVX512 +func (x Int32x16) ConcatPermute(y Int32x16, indices Uint32x16) Int32x16 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2D, CPU Feature: AVX512 +func (x Uint32x16) ConcatPermute(y Uint32x16, indices Uint32x16) Uint32x16 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. 
+// +// Asm: VPERMI2PD, CPU Feature: AVX512 +func (x Float64x2) ConcatPermute(y Float64x2, indices Uint64x2) Float64x2 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2Q, CPU Feature: AVX512 +func (x Int64x2) ConcatPermute(y Int64x2, indices Uint64x2) Int64x2 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2Q, CPU Feature: AVX512 +func (x Uint64x2) ConcatPermute(y Uint64x2, indices Uint64x2) Uint64x2 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2PD, CPU Feature: AVX512 +func (x Float64x4) ConcatPermute(y Float64x4, indices Uint64x4) Float64x4 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2Q, CPU Feature: AVX512 +func (x Int64x4) ConcatPermute(y Int64x4, indices Uint64x4) Int64x4 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2Q, CPU Feature: AVX512 +func (x Uint64x4) ConcatPermute(y Uint64x4, indices Uint64x4) Uint64x4 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2PD, CPU Feature: AVX512 +func (x Float64x8) ConcatPermute(y Float64x8, indices Uint64x8) Float64x8 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2Q, CPU Feature: AVX512 +func (x Int64x8) ConcatPermute(y Int64x8, indices Uint64x8) Int64x8 + +// ConcatPermute performs a full permutation of vector x, y using indices: +// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} +// where xy is the concatenation of x (lower half) and y (upper half). +// Only the needed bits to represent xy's index are used in indices' elements. +// +// Asm: VPERMI2Q, CPU Feature: AVX512 +func (x Uint64x8) ConcatPermute(y Uint64x8, indices Uint64x8) Uint64x8 + /* ConcatShiftBytesRight */ // ConcatShiftBytesRight concatenates x and y and shift it right by constant bytes. 
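The ConcatPermute stubs above are exercised by TestConcatPermute in this patch's simd_test.go changes; a minimal usage sketch in the same spirit (it assumes an AVX-512 machine, matching the test's X86.AVX512() guard): indices below the lane count select from x, the lower half of the concatenation, and indices at or above it select from y.

x := simd.LoadInt64x8Slice([]int64{1, 2, 3, 4, 5, 6, 7, 8})
y := simd.LoadInt64x8Slice([]int64{-1, -2, -3, -4, -5, -6, -7, -8})
// The concatenation xy is {1..8, -1..-8}: index 0 selects x's first
// element, index 15 selects y's last.
idx := simd.LoadUint64x8Slice([]uint64{15, 6, 13, 4, 11, 2, 9, 0})
got := make([]int64, 8)
x.ConcatPermute(y, idx).StoreSlice(got)
// got is now {-8, 7, -6, 5, -4, 3, -2, 1}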
@@ -4551,675 +4793,227 @@ func (x Uint64x8) Or(y Uint64x8) Uint64x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. -// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. +// The low 4 bits (values 0-15) of each element of indices is used // -// Asm: VPSHUFB, CPU Feature: AVX -func (x Int8x16) Permute(indices Int8x16) Int8x16 +// Asm: VPERMB, CPU Feature: AVX512VBMI +func (x Int8x16) Permute(indices Uint8x16) Int8x16 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. -// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. +// The low 4 bits (values 0-15) of each element of indices is used // -// Asm: VPSHUFB, CPU Feature: AVX +// Asm: VPERMB, CPU Feature: AVX512VBMI func (x Uint8x16) Permute(indices Uint8x16) Uint8x16 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 5 bits (values 0-31) of each element of indices is used // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Int8x32) Permute(indices Uint8x32) Int8x32 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 5 bits (values 0-31) of each element of indices is used // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Uint8x32) Permute(indices Uint8x32) Uint8x32 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 6 bits (values 0-63) of each element of indices is used // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Int8x64) Permute(indices Uint8x64) Int8x64 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 6 bits (values 0-63) of each element of indices is used // // Asm: VPERMB, CPU Feature: AVX512VBMI func (x Uint8x64) Permute(indices Uint8x64) Uint8x64 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 3 bits (values 0-7) of each element of indices is used // // Asm: VPERMW, CPU Feature: AVX512 func (x Int16x8) Permute(indices Uint16x8) Int16x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. 
+// The low 3 bits (values 0-7) of each element of indices is used // // Asm: VPERMW, CPU Feature: AVX512 func (x Uint16x8) Permute(indices Uint16x8) Uint16x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 4 bits (values 0-15) of each element of indices is used // // Asm: VPERMW, CPU Feature: AVX512 func (x Int16x16) Permute(indices Uint16x16) Int16x16 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 4 bits (values 0-15) of each element of indices is used // // Asm: VPERMW, CPU Feature: AVX512 func (x Uint16x16) Permute(indices Uint16x16) Uint16x16 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 5 bits (values 0-31) of each element of indices is used // // Asm: VPERMW, CPU Feature: AVX512 func (x Int16x32) Permute(indices Uint16x32) Int16x32 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 5 bits (values 0-31) of each element of indices is used // // Asm: VPERMW, CPU Feature: AVX512 func (x Uint16x32) Permute(indices Uint16x32) Uint16x32 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 3 bits (values 0-7) of each element of indices is used // // Asm: VPERMPS, CPU Feature: AVX2 func (x Float32x8) Permute(indices Uint32x8) Float32x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 3 bits (values 0-7) of each element of indices is used // // Asm: VPERMD, CPU Feature: AVX2 func (x Int32x8) Permute(indices Uint32x8) Int32x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 3 bits (values 0-7) of each element of indices is used // // Asm: VPERMD, CPU Feature: AVX2 func (x Uint32x8) Permute(indices Uint32x8) Uint32x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 4 bits (values 0-15) of each element of indices is used // // Asm: VPERMPS, CPU Feature: AVX512 func (x Float32x16) Permute(indices Uint32x16) Float32x16 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. 
+// The low 4 bits (values 0-15) of each element of indices is used // // Asm: VPERMD, CPU Feature: AVX512 func (x Int32x16) Permute(indices Uint32x16) Int32x16 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 4 bits (values 0-15) of each element of indices is used // // Asm: VPERMD, CPU Feature: AVX512 func (x Uint32x16) Permute(indices Uint32x16) Uint32x16 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 2 bits (values 0-3) of each element of indices is used // // Asm: VPERMPD, CPU Feature: AVX512 func (x Float64x4) Permute(indices Uint64x4) Float64x4 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 2 bits (values 0-3) of each element of indices is used // // Asm: VPERMQ, CPU Feature: AVX512 func (x Int64x4) Permute(indices Uint64x4) Int64x4 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 2 bits (values 0-3) of each element of indices is used // // Asm: VPERMQ, CPU Feature: AVX512 func (x Uint64x4) Permute(indices Uint64x4) Uint64x4 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 3 bits (values 0-7) of each element of indices is used // // Asm: VPERMPD, CPU Feature: AVX512 func (x Float64x8) Permute(indices Uint64x8) Float64x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 3 bits (values 0-7) of each element of indices is used // // Asm: VPERMQ, CPU Feature: AVX512 func (x Int64x8) Permute(indices Uint64x8) Int64x8 // Permute performs a full permutation of vector x using indices: // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} -// Only the needed bits to represent x's index are used in indices' elements. +// The low 3 bits (values 0-7) of each element of indices is used // // Asm: VPERMQ, CPU Feature: AVX512 func (x Uint64x8) Permute(indices Uint64x8) Uint64x8 -/* Permute2 */ - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2B, CPU Feature: AVX512VBMI -func (x Int8x16) Permute2(y Int8x16, indices Uint8x16) Int8x16 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. 
-// -// Asm: VPERMI2B, CPU Feature: AVX512VBMI -func (x Uint8x16) Permute2(y Uint8x16, indices Uint8x16) Uint8x16 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2B, CPU Feature: AVX512VBMI -func (x Int8x32) Permute2(y Int8x32, indices Uint8x32) Int8x32 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2B, CPU Feature: AVX512VBMI -func (x Uint8x32) Permute2(y Uint8x32, indices Uint8x32) Uint8x32 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2B, CPU Feature: AVX512VBMI -func (x Int8x64) Permute2(y Int8x64, indices Uint8x64) Int8x64 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2B, CPU Feature: AVX512VBMI -func (x Uint8x64) Permute2(y Uint8x64, indices Uint8x64) Uint8x64 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2W, CPU Feature: AVX512 -func (x Int16x8) Permute2(y Int16x8, indices Uint16x8) Int16x8 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2W, CPU Feature: AVX512 -func (x Uint16x8) Permute2(y Uint16x8, indices Uint16x8) Uint16x8 +/* PermuteOrZero */ -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2W, CPU Feature: AVX512 -func (x Int16x16) Permute2(y Int16x16, indices Uint16x16) Int16x16 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2W, CPU Feature: AVX512 -func (x Uint16x16) Permute2(y Uint16x16, indices Uint16x16) Uint16x16 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. 
-// -// Asm: VPERMI2W, CPU Feature: AVX512 -func (x Int16x32) Permute2(y Int16x32, indices Uint16x32) Int16x32 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2W, CPU Feature: AVX512 -func (x Uint16x32) Permute2(y Uint16x32, indices Uint16x32) Uint16x32 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2PS, CPU Feature: AVX512 -func (x Float32x4) Permute2(y Float32x4, indices Uint32x4) Float32x4 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2D, CPU Feature: AVX512 -func (x Int32x4) Permute2(y Int32x4, indices Uint32x4) Int32x4 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2D, CPU Feature: AVX512 -func (x Uint32x4) Permute2(y Uint32x4, indices Uint32x4) Uint32x4 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2PS, CPU Feature: AVX512 -func (x Float32x8) Permute2(y Float32x8, indices Uint32x8) Float32x8 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2D, CPU Feature: AVX512 -func (x Int32x8) Permute2(y Int32x8, indices Uint32x8) Int32x8 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2D, CPU Feature: AVX512 -func (x Uint32x8) Permute2(y Uint32x8, indices Uint32x8) Uint32x8 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2PS, CPU Feature: AVX512 -func (x Float32x16) Permute2(y Float32x16, indices Uint32x16) Float32x16 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. 
-// -// Asm: VPERMI2D, CPU Feature: AVX512 -func (x Int32x16) Permute2(y Int32x16, indices Uint32x16) Int32x16 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2D, CPU Feature: AVX512 -func (x Uint32x16) Permute2(y Uint32x16, indices Uint32x16) Uint32x16 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2PD, CPU Feature: AVX512 -func (x Float64x2) Permute2(y Float64x2, indices Uint64x2) Float64x2 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2Q, CPU Feature: AVX512 -func (x Int64x2) Permute2(y Int64x2, indices Uint64x2) Int64x2 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2Q, CPU Feature: AVX512 -func (x Uint64x2) Permute2(y Uint64x2, indices Uint64x2) Uint64x2 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2PD, CPU Feature: AVX512 -func (x Float64x4) Permute2(y Float64x4, indices Uint64x4) Float64x4 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2Q, CPU Feature: AVX512 -func (x Int64x4) Permute2(y Int64x4, indices Uint64x4) Int64x4 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2Q, CPU Feature: AVX512 -func (x Uint64x4) Permute2(y Uint64x4, indices Uint64x4) Uint64x4 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2PD, CPU Feature: AVX512 -func (x Float64x8) Permute2(y Float64x8, indices Uint64x8) Float64x8 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. -// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2Q, CPU Feature: AVX512 -func (x Int64x8) Permute2(y Int64x8, indices Uint64x8) Int64x8 - -// Permute2 performs a full permutation of vector x, y using indices: -// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} -// where xy is x appending y. 
-// Only the needed bits to represent xy's index are used in indices' elements. -// -// Asm: VPERMI2Q, CPU Feature: AVX512 -func (x Uint64x8) Permute2(y Uint64x8, indices Uint64x8) Uint64x8 - -/* PermuteConstant */ - -// PermuteConstant performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFD, CPU Feature: AVX -func (x Int32x4) PermuteConstant(indices uint8) Int32x4 - -// PermuteConstant performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFD, CPU Feature: AVX -func (x Uint32x4) PermuteConstant(indices uint8) Uint32x4 - -/* PermuteConstantGrouped */ - -// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFD, CPU Feature: AVX2 -func (x Int32x8) PermuteConstantGrouped(indices uint8) Int32x8 - -// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFD, CPU Feature: AVX512 -func (x Int32x16) PermuteConstantGrouped(indices uint8) Int32x16 - -// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFD, CPU Feature: AVX2 -func (x Uint32x8) PermuteConstantGrouped(indices uint8) Uint32x8 - -// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. 
-// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFD, CPU Feature: AVX512 -func (x Uint32x16) PermuteConstantGrouped(indices uint8) Uint32x16 - -/* PermuteConstantHi */ - -// PermuteConstantHi performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX512 -func (x Int16x8) PermuteConstantHi(indices uint8) Int16x8 - -// PermuteConstantHi performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX -func (x Int32x4) PermuteConstantHi(indices uint8) Int32x4 - -// PermuteConstantHi performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX512 -func (x Uint16x8) PermuteConstantHi(indices uint8) Uint16x8 - -// PermuteConstantHi performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX -func (x Uint32x4) PermuteConstantHi(indices uint8) Uint32x4 - -/* PermuteConstantHiGrouped */ - -// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX2 -func (x Int16x16) PermuteConstantHiGrouped(indices uint8) Int16x16 - -// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
-// -// Asm: VPSHUFHW, CPU Feature: AVX512 -func (x Int16x32) PermuteConstantHiGrouped(indices uint8) Int16x32 - -// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX2 -func (x Uint16x16) PermuteConstantHiGrouped(indices uint8) Uint16x16 - -// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX512 -func (x Uint16x32) PermuteConstantHiGrouped(indices uint8) Uint16x32 - -/* PermuteConstantLo */ - -// PermuteConstantLo performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX512 -func (x Int16x8) PermuteConstantLo(indices uint8) Int16x8 - -// PermuteConstantLo performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX -func (x Int32x4) PermuteConstantLo(indices uint8) Int32x4 - -// PermuteConstantLo performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX512 -func (x Uint16x8) PermuteConstantLo(indices uint8) Uint16x8 - -// PermuteConstantLo performs a permutation of vector x using constant indices: -// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
-// -// Asm: VPSHUFHW, CPU Feature: AVX -func (x Uint32x4) PermuteConstantLo(indices uint8) Uint32x4 - -/* PermuteConstantLoGrouped */ - -// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX2 -func (x Int16x16) PermuteConstantLoGrouped(indices uint8) Int16x16 - -// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. -// -// Asm: VPSHUFHW, CPU Feature: AVX512 -func (x Int16x32) PermuteConstantLoGrouped(indices uint8) Int16x32 - -// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// PermuteOrZero performs a full permutation of vector x using indices: +// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// The lower four bits of each byte-sized index in indices select an element from x, +// unless the index's sign bit is set in which case zero is used instead. // -// Asm: VPSHUFHW, CPU Feature: AVX2 -func (x Uint16x16) PermuteConstantLoGrouped(indices uint8) Uint16x16 +// Asm: VPSHUFB, CPU Feature: AVX +func (x Int8x16) PermuteOrZero(indices Int8x16) Int8x16 -// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices: -// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} -// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. -// Each group is of size 128-bit. -// -// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// PermuteOrZero performs a full permutation of vector x using indices: +// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} +// The lower four bits of each byte-sized index in indices select an element from x, +// unless the index's sign bit is set in which case zero is used instead. 
// -// Asm: VPSHUFHW, CPU Feature: AVX512 -func (x Uint16x32) PermuteConstantLoGrouped(indices uint8) Uint16x32 +// Asm: VPSHUFB, CPU Feature: AVX +func (x Uint8x16) PermuteOrZero(indices Int8x16) Uint8x16 -/* PermuteGrouped */ +/* PermuteOrZeroGrouped */ -// PermuteGrouped performs a grouped permutation of vector x using indices: -// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} -// Only the needed bits to represent the index of a group of x are used in indices' elements. -// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. +// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices: +// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x, +// unless the index's sign bit is set in which case zero is used instead. // Each group is of size 128-bit. // // Asm: VPSHUFB, CPU Feature: AVX2 -func (x Int8x32) PermuteGrouped(indices Int8x32) Int8x32 +func (x Int8x32) PermuteOrZeroGrouped(indices Int8x32) Int8x32 -// PermuteGrouped performs a grouped permutation of vector x using indices: -// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} -// Only the needed bits to represent the index of a group of x are used in indices' elements. -// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. +// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices: +// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x, +// unless the index's sign bit is set in which case zero is used instead. // Each group is of size 128-bit. // // Asm: VPSHUFB, CPU Feature: AVX512 -func (x Int8x64) PermuteGrouped(indices Int8x64) Int8x64 +func (x Int8x64) PermuteOrZeroGrouped(indices Int8x64) Int8x64 -// PermuteGrouped performs a grouped permutation of vector x using indices: -// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} -// Only the needed bits to represent the index of a group of x are used in indices' elements. -// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. +// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices: +// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x, +// unless the index's sign bit is set in which case zero is used instead. // Each group is of size 128-bit. // // Asm: VPSHUFB, CPU Feature: AVX2 -func (x Uint8x32) PermuteGrouped(indices Uint8x32) Uint8x32 +func (x Uint8x32) PermuteOrZeroGrouped(indices Int8x32) Uint8x32 -// PermuteGrouped performs a grouped permutation of vector x using indices: -// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} -// Only the needed bits to represent the index of a group of x are used in indices' elements. 
-// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. +// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices: +// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} +// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x, +// unless the index's sign bit is set in which case zero is used instead. // Each group is of size 128-bit. // // Asm: VPSHUFB, CPU Feature: AVX512 -func (x Uint8x64) PermuteGrouped(indices Uint8x64) Uint8x64 +func (x Uint8x64) PermuteOrZeroGrouped(indices Int8x64) Uint8x64 /* Reciprocal */ @@ -5807,8 +5601,10 @@ func (x Float64x8) Scale(y Float64x8) Float64x8 /* Select128FromPair */ -// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves -// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// Select128FromPair treats the 256-bit vectors x and y as a single vector of four +// 128-bit elements, and returns a 256-bit result formed by +// concatenating the two elements specified by lo and hi. +// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. @@ -5816,8 +5612,10 @@ func (x Float64x8) Scale(y Float64x8) Float64x8 // Asm: VPERM2F128, CPU Feature: AVX func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8 -// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves -// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// Select128FromPair treats the 256-bit vectors x and y as a single vector of four +// 128-bit elements, and returns a 256-bit result formed by +// concatenating the two elements specified by lo and hi. +// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. @@ -5825,8 +5623,10 @@ func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8 // Asm: VPERM2F128, CPU Feature: AVX func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4 -// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves -// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// Select128FromPair treats the 256-bit vectors x and y as a single vector of four +// 128-bit elements, and returns a 256-bit result formed by +// concatenating the two elements specified by lo and hi. +// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. @@ -5834,8 +5634,10 @@ func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4 // Asm: VPERM2I128, CPU Feature: AVX2 func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8 -// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves -// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. 
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four +// 128-bit elements, and returns a 256-bit result formed by +// concatenating the two elements specified by lo and hi. +// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. @@ -5843,8 +5645,10 @@ func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8 // Asm: VPERM2I128, CPU Feature: AVX2 func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4 -// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves -// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// Select128FromPair treats the 256-bit vectors x and y as a single vector of four +// 128-bit elements, and returns a 256-bit result formed by +// concatenating the two elements specified by lo and hi. +// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. @@ -5852,8 +5656,10 @@ func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4 // Asm: VPERM2I128, CPU Feature: AVX2 func (x Uint32x8) Select128FromPair(lo, hi uint8, y Uint32x8) Uint32x8 -// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves -// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// Select128FromPair treats the 256-bit vectors x and y as a single vector of four +// 128-bit elements, and returns a 256-bit result formed by +// concatenating the two elements specified by lo and hi. +// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. // lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. diff --git a/src/simd/ops_internal_amd64.go b/src/simd/ops_internal_amd64.go index 8be40995f0..63ee6416a6 100644 --- a/src/simd/ops_internal_amd64.go +++ b/src/simd/ops_internal_amd64.go @@ -338,6 +338,220 @@ func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x // Asm: VSHUFPD, CPU Feature: AVX512 func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8 +/* permuteScalars */ + +// permuteScalars performs a permutation of vector x using constant indices: +// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX +func (x Int32x4) permuteScalars(indices uint8) Int32x4 + +// permuteScalars performs a permutation of vector x using constant indices: +// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPSHUFD, CPU Feature: AVX +func (x Uint32x4) permuteScalars(indices uint8) Uint32x4 + +/* permuteScalarsGrouped */ + +// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX2 +func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8 + +// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX512 +func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16 + +// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX2 +func (x Uint32x8) permuteScalarsGrouped(indices uint8) Uint32x8 + +// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices: +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFD, CPU Feature: AVX512 +func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16 + +/* permuteScalarsHi */ + +// permuteScalarsHi performs a permutation of vector x using constant indices: +// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Int16x8) permuteScalarsHi(indices uint8) Int16x8 + +// permuteScalarsHi performs a permutation of vector x using constant indices: +// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Uint16x8) permuteScalarsHi(indices uint8) Uint16x8 + +/* permuteScalarsHiGrouped */ + +// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices: +// result = +// +// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], +// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} +// +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX2 +func (x Int16x16) permuteScalarsHiGrouped(indices uint8) Int16x16 + +// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices: +// result = +// +// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], +// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} +// +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Int16x32) permuteScalarsHiGrouped(indices uint8) Int16x32 + +// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices: +// result = +// +// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], +// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} +// +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX2 +func (x Uint16x16) permuteScalarsHiGrouped(indices uint8) Uint16x16 + +// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices: +// result = +// +// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], +// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...} +// +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32 + +/* permuteScalarsLo */ + +// permuteScalarsLo performs a permutation of vector x using constant indices: +// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPSHUFLW, CPU Feature: AVX512 +func (x Int16x8) permuteScalarsLo(indices uint8) Int16x8 + +// permuteScalarsLo performs a permutation of vector x using constant indices: +// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]} +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFLW, CPU Feature: AVX512 +func (x Uint16x8) permuteScalarsLo(indices uint8) Uint16x8 + +/* permuteScalarsLoGrouped */ + +// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices: +// +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], +// x_group1[indices[0:2]], ...} +// +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFLW, CPU Feature: AVX2 +func (x Int16x16) permuteScalarsLoGrouped(indices uint8) Int16x16 + +// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices: +// +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], +// x_group1[indices[0:2]], ...} +// +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFLW, CPU Feature: AVX512 +func (x Int16x32) permuteScalarsLoGrouped(indices uint8) Int16x32 + +// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices: +// +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], +// x_group1[indices[0:2]], ...} +// +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFLW, CPU Feature: AVX2 +func (x Uint16x16) permuteScalarsLoGrouped(indices uint8) Uint16x16 + +// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices: +// +// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7], +// x_group1[indices[0:2]], ...} +// +// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index. +// Each group is of size 128-bit. +// +// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPSHUFLW, CPU Feature: AVX512 +func (x Uint16x32) permuteScalarsLoGrouped(indices uint8) Uint16x32 + /* tern */ // tern performs a logical operation on three vectors based on the 8-bit truth table. 
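For reference, the packed-immediate encoding consumed by the package-private permuteScalars methods above (and produced by the exported four-parameter wrappers in shuffles_amd64.go below) can be modeled in plain Go. This is an illustrative sketch only, not part of the CL; packImm and permuteScalarsRef are made-up names.

// Scalar sketch (not part of the patch): models the packed-immediate encoding
// used by permuteScalars and produced by the exported PermuteScalars wrappers.
package main

import "fmt"

// packImm packs four 2-bit indices a, b, c, d into one byte, matching the
// expression a&3 | (b&3)<<2 | (c&3)<<4 | d<<6 used by the exported wrappers.
func packImm(a, b, c, d uint8) uint8 {
	return a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
}

// permuteScalarsRef is a scalar model of the VPSHUFD-style permutation:
// result[i] = x[indices[2i:2i+2]].
func permuteScalarsRef(x [4]int32, imm uint8) [4]int32 {
	var r [4]int32
	for i := 0; i < 4; i++ {
		r[i] = x[(imm>>(2*uint(i)))&3]
	}
	return r
}

func main() {
	x := [4]int32{10, 20, 30, 40}
	imm := packImm(3, 2, 1, 0)             // selects x[3], x[2], x[1], x[0]
	fmt.Println(permuteScalarsRef(x, imm)) // prints [40 30 20 10]
}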
diff --git a/src/simd/shuffles_amd64.go b/src/simd/shuffles_amd64.go index e0d9db9266..b7472f7020 100644 --- a/src/simd/shuffles_amd64.go +++ b/src/simd/shuffles_amd64.go @@ -989,3 +989,280 @@ func (x Int64x8) SelectFromPairGrouped(a, b uint8, y Int64x8) Int64x8 { } panic("missing case, switch should be exhaustive") } + +/* PermuteScalars */ + +// PermuteScalars performs a permutation of vector x's elements using the supplied indices: +// +// result = {x[a], x[b], x[c], x[d]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table may be generated. +// +// Asm: VPSHUFD, CPU Feature: AVX +func (x Int32x4) PermuteScalars(a, b, c, d uint8) Int32x4 { + return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalars performs a permutation of vector x's elements using the supplied indices: +// +// result = {x[a], x[b], x[c], x[d]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table may be generated. +// +// Asm: VPSHUFD, CPU Feature: AVX +func (x Uint32x4) PermuteScalars(a, b, c, d uint8) Uint32x4 { + return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +/* PermuteScalarsGrouped */ + +// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table may be generated. +// +// Asm: VPSHUFD, CPU Feature: AVX2 +func (x Int32x8) PermuteScalarsGrouped(a, b, c, d uint8) Int32x8 { + return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// { x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4], +// x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table may be generated. +// +// Asm: VPSHUFD, CPU Feature: AVX512 +func (x Int32x16) PermuteScalarsGrouped(a, b, c, d uint8) Int32x16 { + return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFD, CPU Feature: AVX2 +func (x Uint32x8) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x8 { + return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// { x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4], +// x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. 
+// +// Asm: VPSHUFD, CPU Feature: AVX512 +func (x Uint32x16) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x16 { + return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +/* PermuteScalarsHi */ + +// PermuteScalarsHi performs a permutation of vector x using the supplied indices: +// +// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Int16x8) PermuteScalarsHi(a, b, c, d uint8) Int16x8 { + return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsHi performs a permutation of vector x using the supplied indices: +// +// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Uint16x8) PermuteScalarsHi(a, b, c, d uint8) Uint16x8 { + return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +/* PermuteScalarsHiGrouped */ + +// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4], +// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFHW, CPU Feature: AVX2 +func (x Int16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x16 { + return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4], +// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12], +// x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20], +// x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Int16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x32 { + return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4], +// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]} +// +// Each group is of size 128-bit. +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. 
+// +// Asm: VPSHUFHW, CPU Feature: AVX2 +func (x Uint16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x16 { + return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// { x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4], +// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12], +// x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20], +// x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFHW, CPU Feature: AVX512 +func (x Uint16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x32 { + return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +/* PermuteScalarsLo */ + +// PermuteScalarsLo performs a permutation of vector x using the supplied indices: +// +// result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFLW, CPU Feature: AVX512 +func (x Int16x8) PermuteScalarsLo(a, b, c, d uint8) Int16x8 { + return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsLo performs a permutation of vector x using the supplied indices: +// +// result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFLW, CPU Feature: AVX512 +func (x Uint16x8) PermuteScalarsLo(a, b, c, d uint8) Uint16x8 { + return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +/* PermuteScalarsLoGrouped */ + +// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7], +// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFLW, CPU Feature: AVX2 +func (x Int16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x16 { + return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7], +// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15], +// x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23], +// x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. 
+// +// Asm: VPSHUFLW, CPU Feature: AVX512 +func (x Int16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x32 { + return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7], +// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]} +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFLW, CPU Feature: AVX2 +func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 { + return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} + +// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices: +// +// result = +// {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7], +// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15], +// x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23], +// x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]} +// +// Each group is of size 128-bit. +// +// Parameters a,b,c,d should have values between 0 and 3. +// If a through d are constants, then an instruction will be inlined, otherwise +// a jump table is generated. +// +// Asm: VPSHUFLW, CPU Feature: AVX512 +func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 { + return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) +} -- 2.52.0
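For reference, the PermuteScalarsLoGrouped behavior documented above can be modeled in plain Go: within each 128-bit group, lanes 0-3 are chosen by a, b, c, d and lanes 4-7 pass through unchanged. This sketch is illustrative only, not part of the CL; permuteScalarsLoGroupedRef is a made-up name, not a package API.

// Scalar sketch (not part of the patch): models PermuteScalarsLoGrouped for a
// 16-element vector, i.e. two 128-bit groups of eight int16 lanes.
package main

import "fmt"

func permuteScalarsLoGroupedRef(x [16]int16, a, b, c, d uint8) [16]int16 {
	idx := [4]uint8{a & 3, b & 3, c & 3, d & 3}
	var r [16]int16
	for g := 0; g < 16; g += 8 { // one iteration per 128-bit group
		for i := 0; i < 4; i++ {
			r[g+i] = x[g+int(idx[i])] // low four lanes are permuted within the group
		}
		for i := 4; i < 8; i++ {
			r[g+i] = x[g+i] // high four lanes pass through
		}
	}
	return r
}

func main() {
	var x [16]int16
	for i := range x {
		x[i] = int16(i)
	}
	fmt.Println(permuteScalarsLoGroupedRef(x, 3, 2, 1, 0))
	// prints [3 2 1 0 4 5 6 7 11 10 9 8 12 13 14 15]
}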