From cf31b1563534d6c4f8d2be87cbfdebd6e61ad479 Mon Sep 17 00:00:00 2001
From: David Chase
Date: Tue, 19 Aug 2025 17:54:38 -0400
Subject: [PATCH] [dev.simd] simd, cmd/compile: add .Masked() peephole opt for many operations

This should get much of the low-hanging and important fruit.
Others can follow later. It needs more testing.

Change-Id: Ic186b075987e85c87197ef9e1ca0b4f33ff96697
Reviewed-on: https://go-review.googlesource.com/c/go/+/697515
Reviewed-by: Junyang Shao
Commit-Queue: David Chase
TryBot-Bypass: David Chase
---
 .../compile/internal/ssa/_gen/simdAMD64.rules |  181 ++
 src/cmd/compile/internal/ssa/rewriteAMD64.go  | 2656 ++++++++++++++++-
 src/simd/_gen/simdgen/gen_simdrules.go        |   63 +-
 src/simd/simd_test.go                         |   33 +
 4 files changed, 2888 insertions(+), 45 deletions(-)

diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 1be54c7382..d5be221c0e 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -851,6 +851,15 @@
 (ShiftAllLeftConcatUint64x2 ...) => (VPSHLDQ128 ...)
 (ShiftAllLeftConcatUint64x4 ...) => (VPSHLDQ256 ...)
 (ShiftAllLeftConcatUint64x8 ...) => (VPSHLDQ512 ...)
+(VPSLLWMasked128 x (MOVQconst [c]) mask) => (VPSLLWMasked128const [uint8(c)] x mask)
+(VPSLLWMasked256 x (MOVQconst [c]) mask) => (VPSLLWMasked256const [uint8(c)] x mask)
+(VPSLLWMasked512 x (MOVQconst [c]) mask) => (VPSLLWMasked512const [uint8(c)] x mask)
+(VPSLLDMasked128 x (MOVQconst [c]) mask) => (VPSLLDMasked128const [uint8(c)] x mask)
+(VPSLLDMasked256 x (MOVQconst [c]) mask) => (VPSLLDMasked256const [uint8(c)] x mask)
+(VPSLLDMasked512 x (MOVQconst [c]) mask) => (VPSLLDMasked512const [uint8(c)] x mask)
+(VPSLLQMasked128 x (MOVQconst [c]) mask) => (VPSLLQMasked128const [uint8(c)] x mask)
+(VPSLLQMasked256 x (MOVQconst [c]) mask) => (VPSLLQMasked256const [uint8(c)] x mask)
+(VPSLLQMasked512 x (MOVQconst [c]) mask) => (VPSLLQMasked512const [uint8(c)] x mask)
 (ShiftAllRightInt16x8 ...) => (VPSRAW128 ...)
 (VPSRAW128 x (MOVQconst [c])) => (VPSRAW128const [uint8(c)] x)
 (ShiftAllRightInt16x16 ...) => (VPSRAW256 ...)
@@ -896,6 +905,15 @@
 (ShiftAllRightConcatUint64x2 ...) => (VPSHRDQ128 ...)
 (ShiftAllRightConcatUint64x4 ...) => (VPSHRDQ256 ...)
 (ShiftAllRightConcatUint64x8 ...) => (VPSHRDQ512 ...)
+(VPSRAWMasked128 x (MOVQconst [c]) mask) => (VPSRAWMasked128const [uint8(c)] x mask)
+(VPSRAWMasked256 x (MOVQconst [c]) mask) => (VPSRAWMasked256const [uint8(c)] x mask)
+(VPSRAWMasked512 x (MOVQconst [c]) mask) => (VPSRAWMasked512const [uint8(c)] x mask)
+(VPSRADMasked128 x (MOVQconst [c]) mask) => (VPSRADMasked128const [uint8(c)] x mask)
+(VPSRADMasked256 x (MOVQconst [c]) mask) => (VPSRADMasked256const [uint8(c)] x mask)
+(VPSRADMasked512 x (MOVQconst [c]) mask) => (VPSRADMasked512const [uint8(c)] x mask)
+(VPSRAQMasked128 x (MOVQconst [c]) mask) => (VPSRAQMasked128const [uint8(c)] x mask)
+(VPSRAQMasked256 x (MOVQconst [c]) mask) => (VPSRAQMasked256const [uint8(c)] x mask)
+(VPSRAQMasked512 x (MOVQconst [c]) mask) => (VPSRAQMasked512const [uint8(c)] x mask)
 (ShiftLeftInt16x8 ...) => (VPSLLVW128 ...)
 (ShiftLeftInt16x16 ...) => (VPSLLVW256 ...)
 (ShiftLeftInt16x32 ...) => (VPSLLVW512 ...)
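For context on the generated rules that follow: the peephole matches a masked
move (VMOVDQU{8,16,32,64}Masked512) whose argument is an unmasked 512-bit op,
and folds the pair into the op's masked form, so the AVX-512 mask is applied
by the operation itself rather than by a separate merging move. A minimal
Go-level sketch of the pattern this targets, assuming the experimental simd
package's Int32x16, Add, and Mask32x16 names (illustrative only; the CL's
subject names the Masked method, the rest of the signature is an assumption):

	import "simd"

	// maskedAdd keeps x+y only in the lanes selected by m.
	// Without the peephole this lowers to an unmasked VPADDD followed by a
	// masked VMOVDQU32; with the rule
	//   (VMOVDQU32Masked512 (VPADDD512 x y) mask) => (VPADDDMasked512 x y mask)
	// the pair fuses into a single masked VPADDD.
	func maskedAdd(x, y simd.Int32x16, m simd.Mask32x16) simd.Int32x16 {
		return x.Add(y).Masked(m)
	}

The VPSLL*/VPSRA* rules above additionally fold a constant shift count
(MOVQconst) into the immediate (...const) forms of the masked shifts,
mirroring existing unmasked rules such as
(VPSRAW128 x (MOVQconst [c])) => (VPSRAW128const [uint8(c)] x).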
@@ -1086,3 +1104,166 @@
 (moveMaskedUint16x32 x mask) => (VMOVDQU16Masked512 x (VPMOVVec16x32ToM mask))
 (moveMaskedUint32x16 x mask) => (VMOVDQU32Masked512 x (VPMOVVec32x16ToM mask))
 (moveMaskedUint64x8 x mask) => (VMOVDQU64Masked512 x (VPMOVVec64x8ToM mask))
+(VMOVDQU8Masked512 (VPABSB512 x) mask) => (VPABSBMasked512 x mask)
+(VMOVDQU16Masked512 (VPABSW512 x) mask) => (VPABSWMasked512 x mask)
+(VMOVDQU32Masked512 (VPABSD512 x) mask) => (VPABSDMasked512 x mask)
+(VMOVDQU64Masked512 (VPABSQ512 x) mask) => (VPABSQMasked512 x mask)
+(VMOVDQU32Masked512 (VPDPWSSD512 x y z) mask) => (VPDPWSSDMasked512 x y z mask)
+(VMOVDQU32Masked512 (VPDPWSSDS512 x y z) mask) => (VPDPWSSDSMasked512 x y z mask)
+(VMOVDQU32Masked512 (VPDPBUSD512 x y z) mask) => (VPDPBUSDMasked512 x y z mask)
+(VMOVDQU32Masked512 (VPDPBUSDS512 x y z) mask) => (VPDPBUSDSMasked512 x y z mask)
+(VMOVDQU32Masked512 (VADDPS512 x y) mask) => (VADDPSMasked512 x y mask)
+(VMOVDQU64Masked512 (VADDPD512 x y) mask) => (VADDPDMasked512 x y mask)
+(VMOVDQU8Masked512 (VPADDB512 x y) mask) => (VPADDBMasked512 x y mask)
+(VMOVDQU16Masked512 (VPADDW512 x y) mask) => (VPADDWMasked512 x y mask)
+(VMOVDQU32Masked512 (VPADDD512 x y) mask) => (VPADDDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPADDQ512 x y) mask) => (VPADDQMasked512 x y mask)
+(VMOVDQU8Masked512 (VPADDSB512 x y) mask) => (VPADDSBMasked512 x y mask)
+(VMOVDQU16Masked512 (VPADDSW512 x y) mask) => (VPADDSWMasked512 x y mask)
+(VMOVDQU8Masked512 (VPADDUSB512 x y) mask) => (VPADDUSBMasked512 x y mask)
+(VMOVDQU16Masked512 (VPADDUSW512 x y) mask) => (VPADDUSWMasked512 x y mask)
+(VMOVDQU32Masked512 (VPANDD512 x y) mask) => (VPANDDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPANDQ512 x y) mask) => (VPANDQMasked512 x y mask)
+(VMOVDQU32Masked512 (VPANDND512 x y) mask) => (VPANDNDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPANDNQ512 x y) mask) => (VPANDNQMasked512 x y mask)
+(VMOVDQU8Masked512 (VPAVGB512 x y) mask) => (VPAVGBMasked512 x y mask)
+(VMOVDQU16Masked512 (VPAVGW512 x y) mask) => (VPAVGWMasked512 x y mask)
+(VMOVDQU32Masked512 (VBROADCASTSS512 x) mask) => (VBROADCASTSSMasked512 x mask)
+(VMOVDQU64Masked512 (VBROADCASTSD512 x) mask) => (VBROADCASTSDMasked512 x mask)
+(VMOVDQU8Masked512 (VPBROADCASTB512 x) mask) => (VPBROADCASTBMasked512 x mask)
+(VMOVDQU16Masked512 (VPBROADCASTW512 x) mask) => (VPBROADCASTWMasked512 x mask)
+(VMOVDQU32Masked512 (VPBROADCASTD512 x) mask) => (VPBROADCASTDMasked512 x mask)
+(VMOVDQU64Masked512 (VPBROADCASTQ512 x) mask) => (VPBROADCASTQMasked512 x mask)
+(VMOVDQU32Masked512 (VRNDSCALEPS512 [a] x) mask) => (VRNDSCALEPSMasked512 [a] x mask)
+(VMOVDQU64Masked512 (VRNDSCALEPD512 [a] x) mask) => (VRNDSCALEPDMasked512 [a] x mask)
+(VMOVDQU32Masked512 (VREDUCEPS512 [a] x) mask) => (VREDUCEPSMasked512 [a] x mask)
+(VMOVDQU64Masked512 (VREDUCEPD512 [a] x) mask) => (VREDUCEPDMasked512 [a] x mask)
+(VMOVDQU32Masked512 (VCVTTPS2DQ512 x) mask) => (VCVTTPS2DQMasked512 x mask)
+(VMOVDQU8Masked512 (VPMOVZXBW512 x) mask) => (VPMOVZXBWMasked512 x mask)
+(VMOVDQU32Masked512 (VCVTPS2UDQ512 x) mask) => (VCVTPS2UDQMasked512 x mask)
+(VMOVDQU16Masked512 (VPMOVZXWD512 x) mask) => (VPMOVZXWDMasked512 x mask)
+(VMOVDQU32Masked512 (VDIVPS512 x y) mask) => (VDIVPSMasked512 x y mask)
+(VMOVDQU64Masked512 (VDIVPD512 x y) mask) => (VDIVPDMasked512 x y mask)
+(VMOVDQU16Masked512 (VPMADDWD512 x y) mask) => (VPMADDWDMasked512 x y mask)
+(VMOVDQU16Masked512 (VPMADDUBSW512 x y) mask) => (VPMADDUBSWMasked512 x y mask)
+(VMOVDQU8Masked512 (VGF2P8AFFINEINVQB512 [a] x y) mask) => (VGF2P8AFFINEINVQBMasked512 [a] x y mask)
+(VMOVDQU8Masked512 (VGF2P8AFFINEQB512 [a] x y) mask) => (VGF2P8AFFINEQBMasked512 [a] x y mask)
+(VMOVDQU8Masked512 (VGF2P8MULB512 x y) mask) => (VGF2P8MULBMasked512 x y mask)
+(VMOVDQU32Masked512 (VMAXPS512 x y) mask) => (VMAXPSMasked512 x y mask)
+(VMOVDQU64Masked512 (VMAXPD512 x y) mask) => (VMAXPDMasked512 x y mask)
+(VMOVDQU8Masked512 (VPMAXSB512 x y) mask) => (VPMAXSBMasked512 x y mask)
+(VMOVDQU16Masked512 (VPMAXSW512 x y) mask) => (VPMAXSWMasked512 x y mask)
+(VMOVDQU32Masked512 (VPMAXSD512 x y) mask) => (VPMAXSDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPMAXSQ512 x y) mask) => (VPMAXSQMasked512 x y mask)
+(VMOVDQU8Masked512 (VPMAXUB512 x y) mask) => (VPMAXUBMasked512 x y mask)
+(VMOVDQU16Masked512 (VPMAXUW512 x y) mask) => (VPMAXUWMasked512 x y mask)
+(VMOVDQU32Masked512 (VPMAXUD512 x y) mask) => (VPMAXUDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPMAXUQ512 x y) mask) => (VPMAXUQMasked512 x y mask)
+(VMOVDQU32Masked512 (VMINPS512 x y) mask) => (VMINPSMasked512 x y mask)
+(VMOVDQU64Masked512 (VMINPD512 x y) mask) => (VMINPDMasked512 x y mask)
+(VMOVDQU8Masked512 (VPMINSB512 x y) mask) => (VPMINSBMasked512 x y mask)
+(VMOVDQU16Masked512 (VPMINSW512 x y) mask) => (VPMINSWMasked512 x y mask)
+(VMOVDQU32Masked512 (VPMINSD512 x y) mask) => (VPMINSDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPMINSQ512 x y) mask) => (VPMINSQMasked512 x y mask)
+(VMOVDQU8Masked512 (VPMINUB512 x y) mask) => (VPMINUBMasked512 x y mask)
+(VMOVDQU16Masked512 (VPMINUW512 x y) mask) => (VPMINUWMasked512 x y mask)
+(VMOVDQU32Masked512 (VPMINUD512 x y) mask) => (VPMINUDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPMINUQ512 x y) mask) => (VPMINUQMasked512 x y mask)
+(VMOVDQU32Masked512 (VFMADD213PS512 x y z) mask) => (VFMADD213PSMasked512 x y z mask)
+(VMOVDQU64Masked512 (VFMADD213PD512 x y z) mask) => (VFMADD213PDMasked512 x y z mask)
+(VMOVDQU32Masked512 (VFMADDSUB213PS512 x y z) mask) => (VFMADDSUB213PSMasked512 x y z mask)
+(VMOVDQU64Masked512 (VFMADDSUB213PD512 x y z) mask) => (VFMADDSUB213PDMasked512 x y z mask)
+(VMOVDQU16Masked512 (VPMULHW512 x y) mask) => (VPMULHWMasked512 x y mask)
+(VMOVDQU16Masked512 (VPMULHUW512 x y) mask) => (VPMULHUWMasked512 x y mask)
+(VMOVDQU32Masked512 (VMULPS512 x y) mask) => (VMULPSMasked512 x y mask)
+(VMOVDQU64Masked512 (VMULPD512 x y) mask) => (VMULPDMasked512 x y mask)
+(VMOVDQU16Masked512 (VPMULLW512 x y) mask) => (VPMULLWMasked512 x y mask)
+(VMOVDQU32Masked512 (VPMULLD512 x y) mask) => (VPMULLDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPMULLQ512 x y) mask) => (VPMULLQMasked512 x y mask)
+(VMOVDQU32Masked512 (VFMSUBADD213PS512 x y z) mask) => (VFMSUBADD213PSMasked512 x y z mask)
+(VMOVDQU64Masked512 (VFMSUBADD213PD512 x y z) mask) => (VFMSUBADD213PDMasked512 x y z mask)
+(VMOVDQU8Masked512 (VPOPCNTB512 x) mask) => (VPOPCNTBMasked512 x mask)
+(VMOVDQU16Masked512 (VPOPCNTW512 x) mask) => (VPOPCNTWMasked512 x mask)
+(VMOVDQU32Masked512 (VPOPCNTD512 x) mask) => (VPOPCNTDMasked512 x mask)
+(VMOVDQU64Masked512 (VPOPCNTQ512 x) mask) => (VPOPCNTQMasked512 x mask)
+(VMOVDQU32Masked512 (VPORD512 x y) mask) => (VPORDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPORQ512 x y) mask) => (VPORQMasked512 x y mask)
+(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask)
+(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask)
+(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask)
+(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask)
+(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask)
+(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask)
+(VMOVDQU8Masked512 (VPERMB512 x y) mask) => (VPERMBMasked512 x y mask)
+(VMOVDQU16Masked512 (VPERMW512 x y) mask) => (VPERMWMasked512 x y mask)
+(VMOVDQU32Masked512 (VPERMPS512 x y) mask) => (VPERMPSMasked512 x y mask)
+(VMOVDQU32Masked512 (VPERMD512 x y) mask) => (VPERMDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPERMPD512 x y) mask) => (VPERMPDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPERMQ512 x y) mask) => (VPERMQMasked512 x y mask)
+(VMOVDQU32Masked512 (VRCP14PS512 x) mask) => (VRCP14PSMasked512 x mask)
+(VMOVDQU64Masked512 (VRCP14PD512 x) mask) => (VRCP14PDMasked512 x mask)
+(VMOVDQU32Masked512 (VRSQRT14PS512 x) mask) => (VRSQRT14PSMasked512 x mask)
+(VMOVDQU64Masked512 (VRSQRT14PD512 x) mask) => (VRSQRT14PDMasked512 x mask)
+(VMOVDQU32Masked512 (VPROLD512 [a] x) mask) => (VPROLDMasked512 [a] x mask)
+(VMOVDQU64Masked512 (VPROLQ512 [a] x) mask) => (VPROLQMasked512 [a] x mask)
+(VMOVDQU32Masked512 (VPRORD512 [a] x) mask) => (VPRORDMasked512 [a] x mask)
+(VMOVDQU64Masked512 (VPRORQ512 [a] x) mask) => (VPRORQMasked512 [a] x mask)
+(VMOVDQU32Masked512 (VPROLVD512 x y) mask) => (VPROLVDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPROLVQ512 x y) mask) => (VPROLVQMasked512 x y mask)
+(VMOVDQU32Masked512 (VPRORVD512 x y) mask) => (VPRORVDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPRORVQ512 x y) mask) => (VPRORVQMasked512 x y mask)
+(VMOVDQU32Masked512 (VSCALEFPS512 x y) mask) => (VSCALEFPSMasked512 x y mask)
+(VMOVDQU64Masked512 (VSCALEFPD512 x y) mask) => (VSCALEFPDMasked512 x y mask)
+(VMOVDQU16Masked512 (VPSHLDW512 [a] x y) mask) => (VPSHLDWMasked512 [a] x y mask)
+(VMOVDQU32Masked512 (VPSHLDD512 [a] x y) mask) => (VPSHLDDMasked512 [a] x y mask)
+(VMOVDQU64Masked512 (VPSHLDQ512 [a] x y) mask) => (VPSHLDQMasked512 [a] x y mask)
+(VMOVDQU16Masked512 (VPSLLW512 x y) mask) => (VPSLLWMasked512 x y mask)
+(VMOVDQU32Masked512 (VPSLLD512 x y) mask) => (VPSLLDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPSLLQ512 x y) mask) => (VPSLLQMasked512 x y mask)
+(VMOVDQU16Masked512 (VPSHRDW512 [a] x y) mask) => (VPSHRDWMasked512 [a] x y mask)
+(VMOVDQU32Masked512 (VPSHRDD512 [a] x y) mask) => (VPSHRDDMasked512 [a] x y mask)
+(VMOVDQU64Masked512 (VPSHRDQ512 [a] x y) mask) => (VPSHRDQMasked512 [a] x y mask)
+(VMOVDQU16Masked512 (VPSRAW512 x y) mask) => (VPSRAWMasked512 x y mask)
+(VMOVDQU32Masked512 (VPSRAD512 x y) mask) => (VPSRADMasked512 x y mask)
+(VMOVDQU64Masked512 (VPSRAQ512 x y) mask) => (VPSRAQMasked512 x y mask)
+(VMOVDQU16Masked512 (VPSRLW512 x y) mask) => (VPSRLWMasked512 x y mask)
+(VMOVDQU32Masked512 (VPSRLD512 x y) mask) => (VPSRLDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPSRLQ512 x y) mask) => (VPSRLQMasked512 x y mask)
+(VMOVDQU16Masked512 (VPSHLDVW512 x y z) mask) => (VPSHLDVWMasked512 x y z mask)
+(VMOVDQU32Masked512 (VPSHLDVD512 x y z) mask) => (VPSHLDVDMasked512 x y z mask)
+(VMOVDQU64Masked512 (VPSHLDVQ512 x y z) mask) => (VPSHLDVQMasked512 x y z mask)
+(VMOVDQU16Masked512 (VPSLLVW512 x y) mask) => (VPSLLVWMasked512 x y mask)
+(VMOVDQU32Masked512 (VPSLLVD512 x y) mask) => (VPSLLVDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPSLLVQ512 x y) mask) => (VPSLLVQMasked512 x y mask)
+(VMOVDQU16Masked512 (VPSHRDVW512 x y z) mask) => (VPSHRDVWMasked512 x y z mask)
+(VMOVDQU32Masked512 (VPSHRDVD512 x y z) mask) => (VPSHRDVDMasked512 x y z mask)
+(VMOVDQU64Masked512 (VPSHRDVQ512 x y z) mask) => (VPSHRDVQMasked512 x y z mask)
+(VMOVDQU16Masked512 (VPSRAVW512 x y) mask) => (VPSRAVWMasked512 x y mask)
+(VMOVDQU32Masked512 (VPSRAVD512 x y) mask) => (VPSRAVDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPSRAVQ512 x y) mask) => (VPSRAVQMasked512 x y mask)
+(VMOVDQU16Masked512 (VPSRLVW512 x y) mask) => (VPSRLVWMasked512 x y mask)
+(VMOVDQU32Masked512 (VPSRLVD512 x y) mask) => (VPSRLVDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPSRLVQ512 x y) mask) => (VPSRLVQMasked512 x y mask)
+(VMOVDQU32Masked512 (VSQRTPS512 x) mask) => (VSQRTPSMasked512 x mask)
+(VMOVDQU64Masked512 (VSQRTPD512 x) mask) => (VSQRTPDMasked512 x mask)
+(VMOVDQU32Masked512 (VSUBPS512 x y) mask) => (VSUBPSMasked512 x y mask)
+(VMOVDQU64Masked512 (VSUBPD512 x y) mask) => (VSUBPDMasked512 x y mask)
+(VMOVDQU8Masked512 (VPSUBB512 x y) mask) => (VPSUBBMasked512 x y mask)
+(VMOVDQU16Masked512 (VPSUBW512 x y) mask) => (VPSUBWMasked512 x y mask)
+(VMOVDQU32Masked512 (VPSUBD512 x y) mask) => (VPSUBDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPSUBQ512 x y) mask) => (VPSUBQMasked512 x y mask)
+(VMOVDQU8Masked512 (VPSUBSB512 x y) mask) => (VPSUBSBMasked512 x y mask)
+(VMOVDQU16Masked512 (VPSUBSW512 x y) mask) => (VPSUBSWMasked512 x y mask)
+(VMOVDQU8Masked512 (VPSUBUSB512 x y) mask) => (VPSUBUSBMasked512 x y mask)
+(VMOVDQU16Masked512 (VPSUBUSW512 x y) mask) => (VPSUBUSWMasked512 x y mask)
+(VMOVDQU32Masked512 (VPXORD512 x y) mask) => (VPXORDMasked512 x y mask)
+(VMOVDQU64Masked512 (VPXORQ512 x y) mask) => (VPXORQMasked512 x y mask)
+(VMOVDQU16Masked512 (VPSLLW512const [a] x) mask) => (VPSLLWMasked512const [a] x mask)
+(VMOVDQU32Masked512 (VPSLLD512const [a] x) mask) => (VPSLLDMasked512const [a] x mask)
+(VMOVDQU64Masked512 (VPSLLQ512const [a] x) mask) => (VPSLLQMasked512const [a] x mask)
+(VMOVDQU16Masked512 (VPSRLW512const [a] x) mask) => (VPSRLWMasked512const [a] x mask)
+(VMOVDQU32Masked512 (VPSRLD512const [a] x) mask) => (VPSRLDMasked512const [a] x mask)
+(VMOVDQU64Masked512 (VPSRLQ512const [a] x) mask) => (VPSRLQMasked512const [a] x mask)
+(VMOVDQU16Masked512 (VPSRAW512const [a] x) mask) => (VPSRAWMasked512const [a] x mask)
+(VMOVDQU32Masked512 (VPSRAD512const [a] x) mask) => (VPSRADMasked512const [a] x mask)
+(VMOVDQU64Masked512 (VPSRAQ512const [a] x) mask) => (VPSRAQMasked512const [a] x mask)
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 78c1ddd9dc..924fc2ecf6 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -507,6 +507,14 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpAMD64TESTW(v)
 	case OpAMD64TESTWconst:
 		return rewriteValueAMD64_OpAMD64TESTWconst(v)
+	case OpAMD64VMOVDQU16Masked512:
+		return rewriteValueAMD64_OpAMD64VMOVDQU16Masked512(v)
+	case OpAMD64VMOVDQU32Masked512:
+		return rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v)
+	case OpAMD64VMOVDQU64Masked512:
+		return rewriteValueAMD64_OpAMD64VMOVDQU64Masked512(v)
+	case OpAMD64VMOVDQU8Masked512:
+		return rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v)
 	case OpAMD64VPANDQ512:
 		return rewriteValueAMD64_OpAMD64VPANDQ512(v)
 	case OpAMD64VPMOVVec16x16ToM:
@@ -539,36 +547,72 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpAMD64VPSLLD256(v)
 	case OpAMD64VPSLLD512:
 		return rewriteValueAMD64_OpAMD64VPSLLD512(v)
+	case OpAMD64VPSLLDMasked128:
+		return rewriteValueAMD64_OpAMD64VPSLLDMasked128(v)
+	case OpAMD64VPSLLDMasked256:
+		return rewriteValueAMD64_OpAMD64VPSLLDMasked256(v)
+	case OpAMD64VPSLLDMasked512:
+		return rewriteValueAMD64_OpAMD64VPSLLDMasked512(v)
 	case OpAMD64VPSLLQ128:
 		return
rewriteValueAMD64_OpAMD64VPSLLQ128(v) case OpAMD64VPSLLQ256: return rewriteValueAMD64_OpAMD64VPSLLQ256(v) case OpAMD64VPSLLQ512: return rewriteValueAMD64_OpAMD64VPSLLQ512(v) + case OpAMD64VPSLLQMasked128: + return rewriteValueAMD64_OpAMD64VPSLLQMasked128(v) + case OpAMD64VPSLLQMasked256: + return rewriteValueAMD64_OpAMD64VPSLLQMasked256(v) + case OpAMD64VPSLLQMasked512: + return rewriteValueAMD64_OpAMD64VPSLLQMasked512(v) case OpAMD64VPSLLW128: return rewriteValueAMD64_OpAMD64VPSLLW128(v) case OpAMD64VPSLLW256: return rewriteValueAMD64_OpAMD64VPSLLW256(v) case OpAMD64VPSLLW512: return rewriteValueAMD64_OpAMD64VPSLLW512(v) + case OpAMD64VPSLLWMasked128: + return rewriteValueAMD64_OpAMD64VPSLLWMasked128(v) + case OpAMD64VPSLLWMasked256: + return rewriteValueAMD64_OpAMD64VPSLLWMasked256(v) + case OpAMD64VPSLLWMasked512: + return rewriteValueAMD64_OpAMD64VPSLLWMasked512(v) case OpAMD64VPSRAD128: return rewriteValueAMD64_OpAMD64VPSRAD128(v) case OpAMD64VPSRAD256: return rewriteValueAMD64_OpAMD64VPSRAD256(v) case OpAMD64VPSRAD512: return rewriteValueAMD64_OpAMD64VPSRAD512(v) + case OpAMD64VPSRADMasked128: + return rewriteValueAMD64_OpAMD64VPSRADMasked128(v) + case OpAMD64VPSRADMasked256: + return rewriteValueAMD64_OpAMD64VPSRADMasked256(v) + case OpAMD64VPSRADMasked512: + return rewriteValueAMD64_OpAMD64VPSRADMasked512(v) case OpAMD64VPSRAQ128: return rewriteValueAMD64_OpAMD64VPSRAQ128(v) case OpAMD64VPSRAQ256: return rewriteValueAMD64_OpAMD64VPSRAQ256(v) case OpAMD64VPSRAQ512: return rewriteValueAMD64_OpAMD64VPSRAQ512(v) + case OpAMD64VPSRAQMasked128: + return rewriteValueAMD64_OpAMD64VPSRAQMasked128(v) + case OpAMD64VPSRAQMasked256: + return rewriteValueAMD64_OpAMD64VPSRAQMasked256(v) + case OpAMD64VPSRAQMasked512: + return rewriteValueAMD64_OpAMD64VPSRAQMasked512(v) case OpAMD64VPSRAW128: return rewriteValueAMD64_OpAMD64VPSRAW128(v) case OpAMD64VPSRAW256: return rewriteValueAMD64_OpAMD64VPSRAW256(v) case OpAMD64VPSRAW512: return rewriteValueAMD64_OpAMD64VPSRAW512(v) + case OpAMD64VPSRAWMasked128: + return rewriteValueAMD64_OpAMD64VPSRAWMasked128(v) + case OpAMD64VPSRAWMasked256: + return rewriteValueAMD64_OpAMD64VPSRAWMasked256(v) + case OpAMD64VPSRAWMasked512: + return rewriteValueAMD64_OpAMD64VPSRAWMasked512(v) case OpAMD64XADDLlock: return rewriteValueAMD64_OpAMD64XADDLlock(v) case OpAMD64XADDQlock: @@ -25971,6 +26015,2176 @@ func rewriteValueAMD64_OpAMD64TESTWconst(v *Value) bool { } return false } +func rewriteValueAMD64_OpAMD64VMOVDQU16Masked512(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VMOVDQU16Masked512 (VPABSW512 x) mask) + // result: (VPABSWMasked512 x mask) + for { + if v_0.Op != OpAMD64VPABSW512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPABSWMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPADDW512 x y) mask) + // result: (VPADDWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPADDW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPADDWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPADDSW512 x y) mask) + // result: (VPADDSWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPADDSW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPADDSWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPADDUSW512 x y) mask) + // result: (VPADDUSWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPADDUSW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask 
:= v_1 + v.reset(OpAMD64VPADDUSWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPAVGW512 x y) mask) + // result: (VPAVGWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPAVGW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPAVGWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPBROADCASTW512 x) mask) + // result: (VPBROADCASTWMasked512 x mask) + for { + if v_0.Op != OpAMD64VPBROADCASTW512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPBROADCASTWMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPMOVZXWD512 x) mask) + // result: (VPMOVZXWDMasked512 x mask) + for { + if v_0.Op != OpAMD64VPMOVZXWD512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVZXWDMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPMADDWD512 x y) mask) + // result: (VPMADDWDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMADDWD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMADDWDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPMADDUBSW512 x y) mask) + // result: (VPMADDUBSWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMADDUBSW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMADDUBSWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPMAXSW512 x y) mask) + // result: (VPMAXSWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMAXSW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMAXSWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPMAXUW512 x y) mask) + // result: (VPMAXUWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMAXUW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMAXUWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPMINSW512 x y) mask) + // result: (VPMINSWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMINSW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMINSWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPMINUW512 x y) mask) + // result: (VPMINUWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMINUW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMINUWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPMULHW512 x y) mask) + // result: (VPMULHWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMULHW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMULHWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPMULHUW512 x y) mask) + // result: (VPMULHUWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMULHUW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMULHUWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPMULLW512 x y) mask) + // result: (VPMULLWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMULLW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMULLWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPOPCNTW512 x) mask) + // result: (VPOPCNTWMasked512 x mask) + for { + if v_0.Op != 
OpAMD64VPOPCNTW512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPOPCNTWMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) + // result: (VPERMI2WMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2W512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2WMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPERMW512 x y) mask) + // result: (VPERMWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPERMW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPERMWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSHLDW512 [a] x y) mask) + // result: (VPSHLDWMasked512 [a] x y mask) + for { + if v_0.Op != OpAMD64VPSHLDW512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHLDWMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSLLW512 x y) mask) + // result: (VPSLLWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSLLW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSLLWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSHRDW512 [a] x y) mask) + // result: (VPSHRDWMasked512 [a] x y mask) + for { + if v_0.Op != OpAMD64VPSHRDW512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHRDWMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSRAW512 x y) mask) + // result: (VPSRAWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSRAW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRAWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSRLW512 x y) mask) + // result: (VPSRLWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSRLW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRLWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSHLDVW512 x y z) mask) + // result: (VPSHLDVWMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPSHLDVW512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPSHLDVWMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSLLVW512 x y) mask) + // result: (VPSLLVWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSLLVW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSLLVWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSHRDVW512 x y z) mask) + // result: (VPSHRDVWMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPSHRDVW512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPSHRDVWMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSRAVW512 x y) mask) + // result: (VPSRAVWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSRAVW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRAVWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSRLVW512 x y) mask) + // result: (VPSRLVWMasked512 x y mask) + for { + if v_0.Op 
!= OpAMD64VPSRLVW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRLVWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSUBW512 x y) mask) + // result: (VPSUBWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSUBW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSUBWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSUBSW512 x y) mask) + // result: (VPSUBSWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSUBSW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSUBSWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSUBUSW512 x y) mask) + // result: (VPSUBUSWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSUBUSW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSUBUSWMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSLLW512const [a] x) mask) + // result: (VPSLLWMasked512const [a] x mask) + for { + if v_0.Op != OpAMD64VPSLLW512const { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSLLWMasked512const) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSRLW512const [a] x) mask) + // result: (VPSRLWMasked512const [a] x mask) + for { + if v_0.Op != OpAMD64VPSRLW512const { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRLWMasked512const) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU16Masked512 (VPSRAW512const [a] x) mask) + // result: (VPSRAWMasked512const [a] x mask) + for { + if v_0.Op != OpAMD64VPSRAW512const { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRAWMasked512const) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VMOVDQU32Masked512 (VPABSD512 x) mask) + // result: (VPABSDMasked512 x mask) + for { + if v_0.Op != OpAMD64VPABSD512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPABSDMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPDPWSSD512 x y z) mask) + // result: (VPDPWSSDMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPDPWSSD512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPDPWSSDMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPDPWSSDS512 x y z) mask) + // result: (VPDPWSSDSMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPDPWSSDS512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPDPWSSDSMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPDPBUSD512 x y z) mask) + // result: (VPDPBUSDMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPDPBUSD512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPDPBUSDMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPDPBUSDS512 x y z) mask) + // result: (VPDPBUSDSMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPDPBUSDS512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + 
mask := v_1 + v.reset(OpAMD64VPDPBUSDSMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked512 (VADDPS512 x y) mask) + // result: (VADDPSMasked512 x y mask) + for { + if v_0.Op != OpAMD64VADDPS512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VADDPSMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPADDD512 x y) mask) + // result: (VPADDDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPADDD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPADDDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPANDD512 x y) mask) + // result: (VPANDDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPANDD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPANDDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPANDND512 x y) mask) + // result: (VPANDNDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPANDND512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPANDNDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VBROADCASTSS512 x) mask) + // result: (VBROADCASTSSMasked512 x mask) + for { + if v_0.Op != OpAMD64VBROADCASTSS512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VBROADCASTSSMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPBROADCASTD512 x) mask) + // result: (VPBROADCASTDMasked512 x mask) + for { + if v_0.Op != OpAMD64VPBROADCASTD512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPBROADCASTDMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU32Masked512 (VRNDSCALEPS512 [a] x) mask) + // result: (VRNDSCALEPSMasked512 [a] x mask) + for { + if v_0.Op != OpAMD64VRNDSCALEPS512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VRNDSCALEPSMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU32Masked512 (VREDUCEPS512 [a] x) mask) + // result: (VREDUCEPSMasked512 [a] x mask) + for { + if v_0.Op != OpAMD64VREDUCEPS512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU32Masked512 (VCVTTPS2DQ512 x) mask) + // result: (VCVTTPS2DQMasked512 x mask) + for { + if v_0.Op != OpAMD64VCVTTPS2DQ512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VCVTTPS2DQMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU32Masked512 (VCVTPS2UDQ512 x) mask) + // result: (VCVTPS2UDQMasked512 x mask) + for { + if v_0.Op != OpAMD64VCVTPS2UDQ512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VCVTPS2UDQMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU32Masked512 (VDIVPS512 x y) mask) + // result: (VDIVPSMasked512 x y mask) + for { + if v_0.Op != OpAMD64VDIVPS512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VDIVPSMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VMAXPS512 x y) mask) + // result: (VMAXPSMasked512 x y mask) + for { + if v_0.Op != OpAMD64VMAXPS512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VMAXPSMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPMAXSD512 x 
y) mask) + // result: (VPMAXSDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMAXSD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMAXSDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPMAXUD512 x y) mask) + // result: (VPMAXUDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMAXUD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMAXUDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VMINPS512 x y) mask) + // result: (VMINPSMasked512 x y mask) + for { + if v_0.Op != OpAMD64VMINPS512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VMINPSMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPMINSD512 x y) mask) + // result: (VPMINSDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMINSD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMINSDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPMINUD512 x y) mask) + // result: (VPMINUDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMINUD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMINUDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VFMADD213PS512 x y z) mask) + // result: (VFMADD213PSMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VFMADD213PS512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VFMADD213PSMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked512 (VFMADDSUB213PS512 x y z) mask) + // result: (VFMADDSUB213PSMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VFMADDSUB213PS512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VFMADDSUB213PSMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked512 (VMULPS512 x y) mask) + // result: (VMULPSMasked512 x y mask) + for { + if v_0.Op != OpAMD64VMULPS512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VMULPSMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPMULLD512 x y) mask) + // result: (VPMULLDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMULLD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMULLDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VFMSUBADD213PS512 x y z) mask) + // result: (VFMSUBADD213PSMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VFMSUBADD213PS512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VFMSUBADD213PSMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPOPCNTD512 x) mask) + // result: (VPOPCNTDMasked512 x mask) + for { + if v_0.Op != OpAMD64VPOPCNTD512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPOPCNTDMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPORD512 x y) mask) + // result: (VPORDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPORD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPORDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) + // result: (VPERMI2PSMasked512 x y z mask) + for { + if v_0.Op != 
OpAMD64VPERMI2PS512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2PSMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) + // result: (VPERMI2DMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2D512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2DMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPERMPS512 x y) mask) + // result: (VPERMPSMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPERMPS512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPERMPSMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPERMD512 x y) mask) + // result: (VPERMDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPERMD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPERMDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VRCP14PS512 x) mask) + // result: (VRCP14PSMasked512 x mask) + for { + if v_0.Op != OpAMD64VRCP14PS512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VRCP14PSMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU32Masked512 (VRSQRT14PS512 x) mask) + // result: (VRSQRT14PSMasked512 x mask) + for { + if v_0.Op != OpAMD64VRSQRT14PS512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VRSQRT14PSMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPROLD512 [a] x) mask) + // result: (VPROLDMasked512 [a] x mask) + for { + if v_0.Op != OpAMD64VPROLD512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPROLDMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPRORD512 [a] x) mask) + // result: (VPRORDMasked512 [a] x mask) + for { + if v_0.Op != OpAMD64VPRORD512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPRORDMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPROLVD512 x y) mask) + // result: (VPROLVDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPROLVD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPROLVDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPRORVD512 x y) mask) + // result: (VPRORVDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPRORVD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPRORVDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VSCALEFPS512 x y) mask) + // result: (VSCALEFPSMasked512 x y mask) + for { + if v_0.Op != OpAMD64VSCALEFPS512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VSCALEFPSMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPSHLDD512 [a] x y) mask) + // result: (VPSHLDDMasked512 [a] x y mask) + for { + if v_0.Op != OpAMD64VPSHLDD512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHLDDMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPSLLD512 x y) mask) + // result: (VPSLLDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSLLD512 { 
+ break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSLLDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPSHRDD512 [a] x y) mask) + // result: (VPSHRDDMasked512 [a] x y mask) + for { + if v_0.Op != OpAMD64VPSHRDD512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHRDDMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPSRAD512 x y) mask) + // result: (VPSRADMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSRAD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRADMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPSRLD512 x y) mask) + // result: (VPSRLDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSRLD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRLDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPSHLDVD512 x y z) mask) + // result: (VPSHLDVDMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPSHLDVD512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPSHLDVDMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPSLLVD512 x y) mask) + // result: (VPSLLVDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSLLVD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSLLVDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPSHRDVD512 x y z) mask) + // result: (VPSHRDVDMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPSHRDVD512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPSHRDVDMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPSRAVD512 x y) mask) + // result: (VPSRAVDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSRAVD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRAVDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPSRLVD512 x y) mask) + // result: (VPSRLVDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSRLVD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRLVDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VSQRTPS512 x) mask) + // result: (VSQRTPSMasked512 x mask) + for { + if v_0.Op != OpAMD64VSQRTPS512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VSQRTPSMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU32Masked512 (VSUBPS512 x y) mask) + // result: (VSUBPSMasked512 x y mask) + for { + if v_0.Op != OpAMD64VSUBPS512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VSUBPSMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPSUBD512 x y) mask) + // result: (VPSUBDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSUBD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSUBDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPXORD512 x y) mask) + // result: (VPXORDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPXORD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPXORDMasked512) + 
v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPSLLD512const [a] x) mask) + // result: (VPSLLDMasked512const [a] x mask) + for { + if v_0.Op != OpAMD64VPSLLD512const { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSLLDMasked512const) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPSRLD512const [a] x) mask) + // result: (VPSRLDMasked512const [a] x mask) + for { + if v_0.Op != OpAMD64VPSRLD512const { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRLDMasked512const) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU32Masked512 (VPSRAD512const [a] x) mask) + // result: (VPSRADMasked512const [a] x mask) + for { + if v_0.Op != OpAMD64VPSRAD512const { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRADMasked512const) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VMOVDQU64Masked512(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VMOVDQU64Masked512 (VPABSQ512 x) mask) + // result: (VPABSQMasked512 x mask) + for { + if v_0.Op != OpAMD64VPABSQ512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPABSQMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU64Masked512 (VADDPD512 x y) mask) + // result: (VADDPDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VADDPD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VADDPDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPADDQ512 x y) mask) + // result: (VPADDQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPADDQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPADDQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPANDQ512 x y) mask) + // result: (VPANDQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPANDQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPANDQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPANDNQ512 x y) mask) + // result: (VPANDNQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPANDNQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPANDNQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VBROADCASTSD512 x) mask) + // result: (VBROADCASTSDMasked512 x mask) + for { + if v_0.Op != OpAMD64VBROADCASTSD512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VBROADCASTSDMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPBROADCASTQ512 x) mask) + // result: (VPBROADCASTQMasked512 x mask) + for { + if v_0.Op != OpAMD64VPBROADCASTQ512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPBROADCASTQMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU64Masked512 (VRNDSCALEPD512 [a] x) mask) + // result: (VRNDSCALEPDMasked512 [a] x mask) + for { + if v_0.Op != OpAMD64VRNDSCALEPD512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VRNDSCALEPDMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU64Masked512 (VREDUCEPD512 [a] x) mask) + // result: (VREDUCEPDMasked512 [a] x mask) + for { + if 
v_0.Op != OpAMD64VREDUCEPD512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU64Masked512 (VDIVPD512 x y) mask) + // result: (VDIVPDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VDIVPD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VDIVPDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VMAXPD512 x y) mask) + // result: (VMAXPDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VMAXPD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VMAXPDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPMAXSQ512 x y) mask) + // result: (VPMAXSQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMAXSQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMAXSQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPMAXUQ512 x y) mask) + // result: (VPMAXUQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMAXUQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMAXUQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VMINPD512 x y) mask) + // result: (VMINPDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VMINPD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VMINPDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPMINSQ512 x y) mask) + // result: (VPMINSQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMINSQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMINSQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPMINUQ512 x y) mask) + // result: (VPMINUQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMINUQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMINUQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VFMADD213PD512 x y z) mask) + // result: (VFMADD213PDMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VFMADD213PD512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VFMADD213PDMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU64Masked512 (VFMADDSUB213PD512 x y z) mask) + // result: (VFMADDSUB213PDMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VFMADDSUB213PD512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VFMADDSUB213PDMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU64Masked512 (VMULPD512 x y) mask) + // result: (VMULPDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VMULPD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VMULPDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPMULLQ512 x y) mask) + // result: (VPMULLQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMULLQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMULLQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VFMSUBADD213PD512 x y z) mask) + // result: (VFMSUBADD213PDMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VFMSUBADD213PD512 { + break + } + 
z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VFMSUBADD213PDMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPOPCNTQ512 x) mask) + // result: (VPOPCNTQMasked512 x mask) + for { + if v_0.Op != OpAMD64VPOPCNTQ512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPOPCNTQMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPORQ512 x y) mask) + // result: (VPORQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPORQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPORQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) + // result: (VPERMI2PDMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2PD512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2PDMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) + // result: (VPERMI2QMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2Q512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2QMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPERMPD512 x y) mask) + // result: (VPERMPDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPERMPD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPERMPDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPERMQ512 x y) mask) + // result: (VPERMQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPERMQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPERMQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VRCP14PD512 x) mask) + // result: (VRCP14PDMasked512 x mask) + for { + if v_0.Op != OpAMD64VRCP14PD512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VRCP14PDMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU64Masked512 (VRSQRT14PD512 x) mask) + // result: (VRSQRT14PDMasked512 x mask) + for { + if v_0.Op != OpAMD64VRSQRT14PD512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VRSQRT14PDMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPROLQ512 [a] x) mask) + // result: (VPROLQMasked512 [a] x mask) + for { + if v_0.Op != OpAMD64VPROLQ512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPROLQMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPRORQ512 [a] x) mask) + // result: (VPRORQMasked512 [a] x mask) + for { + if v_0.Op != OpAMD64VPRORQ512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPRORQMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPROLVQ512 x y) mask) + // result: (VPROLVQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPROLVQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPROLVQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPRORVQ512 x y) mask) + // result: (VPRORVQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPRORVQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPRORVQMasked512) + 
v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VSCALEFPD512 x y) mask) + // result: (VSCALEFPDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VSCALEFPD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VSCALEFPDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPSHLDQ512 [a] x y) mask) + // result: (VPSHLDQMasked512 [a] x y mask) + for { + if v_0.Op != OpAMD64VPSHLDQ512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHLDQMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPSLLQ512 x y) mask) + // result: (VPSLLQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSLLQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSLLQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPSHRDQ512 [a] x y) mask) + // result: (VPSHRDQMasked512 [a] x y mask) + for { + if v_0.Op != OpAMD64VPSHRDQ512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSHRDQMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPSRAQ512 x y) mask) + // result: (VPSRAQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSRAQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRAQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPSRLQ512 x y) mask) + // result: (VPSRLQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSRLQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRLQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPSHLDVQ512 x y z) mask) + // result: (VPSHLDVQMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPSHLDVQ512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPSHLDVQMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPSLLVQ512 x y) mask) + // result: (VPSLLVQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSLLVQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSLLVQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPSHRDVQ512 x y z) mask) + // result: (VPSHRDVQMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPSHRDVQ512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPSHRDVQMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPSRAVQ512 x y) mask) + // result: (VPSRAVQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSRAVQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRAVQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPSRLVQ512 x y) mask) + // result: (VPSRLVQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSRLVQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRLVQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VSQRTPD512 x) mask) + // result: (VSQRTPDMasked512 x mask) + for { + if v_0.Op != OpAMD64VSQRTPD512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VSQRTPDMasked512) + v.AddArg2(x, mask) 
+ return true + } + // match: (VMOVDQU64Masked512 (VSUBPD512 x y) mask) + // result: (VSUBPDMasked512 x y mask) + for { + if v_0.Op != OpAMD64VSUBPD512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VSUBPDMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPSUBQ512 x y) mask) + // result: (VPSUBQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSUBQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSUBQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPXORQ512 x y) mask) + // result: (VPXORQMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPXORQ512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPXORQMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPSLLQ512const [a] x) mask) + // result: (VPSLLQMasked512const [a] x mask) + for { + if v_0.Op != OpAMD64VPSLLQ512const { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSLLQMasked512const) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPSRLQ512const [a] x) mask) + // result: (VPSRLQMasked512const [a] x mask) + for { + if v_0.Op != OpAMD64VPSRLQ512const { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRLQMasked512const) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU64Masked512 (VPSRAQ512const [a] x) mask) + // result: (VPSRAQMasked512const [a] x mask) + for { + if v_0.Op != OpAMD64VPSRAQ512const { + break + } + a := auxIntToUint8(v_0.AuxInt) + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSRAQMasked512const) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg2(x, mask) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VMOVDQU8Masked512 (VPABSB512 x) mask) + // result: (VPABSBMasked512 x mask) + for { + if v_0.Op != OpAMD64VPABSB512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPABSBMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU8Masked512 (VPADDB512 x y) mask) + // result: (VPADDBMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPADDB512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPADDBMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU8Masked512 (VPADDSB512 x y) mask) + // result: (VPADDSBMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPADDSB512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPADDSBMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU8Masked512 (VPADDUSB512 x y) mask) + // result: (VPADDUSBMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPADDUSB512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPADDUSBMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU8Masked512 (VPAVGB512 x y) mask) + // result: (VPAVGBMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPAVGB512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPAVGBMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU8Masked512 (VPBROADCASTB512 x) mask) + // result: (VPBROADCASTBMasked512 x mask) + for { + if v_0.Op != OpAMD64VPBROADCASTB512 { + break + } + x := v_0.Args[0] + mask := v_1 + 
v.reset(OpAMD64VPBROADCASTBMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU8Masked512 (VPMOVZXBW512 x) mask) + // result: (VPMOVZXBWMasked512 x mask) + for { + if v_0.Op != OpAMD64VPMOVZXBW512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVZXBWMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU8Masked512 (VGF2P8AFFINEINVQB512 [a] x y) mask) + // result: (VGF2P8AFFINEINVQBMasked512 [a] x y mask) + for { + if v_0.Op != OpAMD64VGF2P8AFFINEINVQB512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VGF2P8AFFINEINVQBMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU8Masked512 (VGF2P8AFFINEQB512 [a] x y) mask) + // result: (VGF2P8AFFINEQBMasked512 [a] x y mask) + for { + if v_0.Op != OpAMD64VGF2P8AFFINEQB512 { + break + } + a := auxIntToUint8(v_0.AuxInt) + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VGF2P8AFFINEQBMasked512) + v.AuxInt = uint8ToAuxInt(a) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU8Masked512 (VGF2P8MULB512 x y) mask) + // result: (VGF2P8MULBMasked512 x y mask) + for { + if v_0.Op != OpAMD64VGF2P8MULB512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VGF2P8MULBMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU8Masked512 (VPMAXSB512 x y) mask) + // result: (VPMAXSBMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMAXSB512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMAXSBMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU8Masked512 (VPMAXUB512 x y) mask) + // result: (VPMAXUBMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMAXUB512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMAXUBMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU8Masked512 (VPMINSB512 x y) mask) + // result: (VPMINSBMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMINSB512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMINSBMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU8Masked512 (VPMINUB512 x y) mask) + // result: (VPMINUBMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPMINUB512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMINUBMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU8Masked512 (VPOPCNTB512 x) mask) + // result: (VPOPCNTBMasked512 x mask) + for { + if v_0.Op != OpAMD64VPOPCNTB512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPOPCNTBMasked512) + v.AddArg2(x, mask) + return true + } + // match: (VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) + // result: (VPERMI2BMasked512 x y z mask) + for { + if v_0.Op != OpAMD64VPERMI2B512 { + break + } + z := v_0.Args[2] + x := v_0.Args[0] + y := v_0.Args[1] + mask := v_1 + v.reset(OpAMD64VPERMI2BMasked512) + v.AddArg4(x, y, z, mask) + return true + } + // match: (VMOVDQU8Masked512 (VPERMB512 x y) mask) + // result: (VPERMBMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPERMB512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPERMBMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU8Masked512 (VPSUBB512 x y) mask) + // result: (VPSUBBMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSUBB512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + 
v.reset(OpAMD64VPSUBBMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU8Masked512 (VPSUBSB512 x y) mask) + // result: (VPSUBSBMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSUBSB512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSUBSBMasked512) + v.AddArg3(x, y, mask) + return true + } + // match: (VMOVDQU8Masked512 (VPSUBUSB512 x y) mask) + // result: (VPSUBUSBMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPSUBUSB512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPSUBUSBMasked512) + v.AddArg3(x, y, mask) + return true + } + return false +} func rewriteValueAMD64_OpAMD64VPANDQ512(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -26170,144 +28384,264 @@ func rewriteValueAMD64_OpAMD64VPMOVVec8x16ToM(v *Value) bool { if v_0.Op != OpAMD64VPMOVMToVec8x16 { break } - x := v_0.Args[0] - v.copyOf(x) + x := v_0.Args[0] + v.copyOf(x) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPMOVVec8x32ToM(v *Value) bool { + v_0 := v.Args[0] + // match: (VPMOVVec8x32ToM (VPMOVMToVec8x32 x)) + // result: x + for { + if v_0.Op != OpAMD64VPMOVMToVec8x32 { + break + } + x := v_0.Args[0] + v.copyOf(x) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPMOVVec8x64ToM(v *Value) bool { + v_0 := v.Args[0] + // match: (VPMOVVec8x64ToM (VPMOVMToVec8x64 x)) + // result: x + for { + if v_0.Op != OpAMD64VPMOVMToVec8x64 { + break + } + x := v_0.Args[0] + v.copyOf(x) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPSLLD128(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPSLLD128 x (MOVQconst [c])) + // result: (VPSLLD128const [uint8(c)] x) + for { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + v.reset(OpAMD64VPSLLD128const) + v.AuxInt = uint8ToAuxInt(uint8(c)) + v.AddArg(x) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPSLLD256(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPSLLD256 x (MOVQconst [c])) + // result: (VPSLLD256const [uint8(c)] x) + for { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + v.reset(OpAMD64VPSLLD256const) + v.AuxInt = uint8ToAuxInt(uint8(c)) + v.AddArg(x) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPSLLD512(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPSLLD512 x (MOVQconst [c])) + // result: (VPSLLD512const [uint8(c)] x) + for { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + v.reset(OpAMD64VPSLLD512const) + v.AuxInt = uint8ToAuxInt(uint8(c)) + v.AddArg(x) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPSLLDMasked128(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPSLLDMasked128 x (MOVQconst [c]) mask) + // result: (VPSLLDMasked128const [uint8(c)] x mask) + for { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + mask := v_2 + v.reset(OpAMD64VPSLLDMasked128const) + v.AuxInt = uint8ToAuxInt(uint8(c)) + v.AddArg2(x, mask) return true } return false } -func rewriteValueAMD64_OpAMD64VPMOVVec8x32ToM(v *Value) bool { +func rewriteValueAMD64_OpAMD64VPSLLDMasked256(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] v_0 := v.Args[0] - // match: (VPMOVVec8x32ToM (VPMOVMToVec8x32 x)) - // result: x + // match: (VPSLLDMasked256 x (MOVQconst [c]) mask) + // result: (VPSLLDMasked256const [uint8(c)] x 
mask) for { - if v_0.Op != OpAMD64VPMOVMToVec8x32 { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { break } - x := v_0.Args[0] - v.copyOf(x) + c := auxIntToInt64(v_1.AuxInt) + mask := v_2 + v.reset(OpAMD64VPSLLDMasked256const) + v.AuxInt = uint8ToAuxInt(uint8(c)) + v.AddArg2(x, mask) return true } return false } -func rewriteValueAMD64_OpAMD64VPMOVVec8x64ToM(v *Value) bool { +func rewriteValueAMD64_OpAMD64VPSLLDMasked512(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] v_0 := v.Args[0] - // match: (VPMOVVec8x64ToM (VPMOVMToVec8x64 x)) - // result: x + // match: (VPSLLDMasked512 x (MOVQconst [c]) mask) + // result: (VPSLLDMasked512const [uint8(c)] x mask) for { - if v_0.Op != OpAMD64VPMOVMToVec8x64 { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { break } - x := v_0.Args[0] - v.copyOf(x) + c := auxIntToInt64(v_1.AuxInt) + mask := v_2 + v.reset(OpAMD64VPSLLDMasked512const) + v.AuxInt = uint8ToAuxInt(uint8(c)) + v.AddArg2(x, mask) return true } return false } -func rewriteValueAMD64_OpAMD64VPSLLD128(v *Value) bool { +func rewriteValueAMD64_OpAMD64VPSLLQ128(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] - // match: (VPSLLD128 x (MOVQconst [c])) - // result: (VPSLLD128const [uint8(c)] x) + // match: (VPSLLQ128 x (MOVQconst [c])) + // result: (VPSLLQ128const [uint8(c)] x) for { x := v_0 if v_1.Op != OpAMD64MOVQconst { break } c := auxIntToInt64(v_1.AuxInt) - v.reset(OpAMD64VPSLLD128const) + v.reset(OpAMD64VPSLLQ128const) v.AuxInt = uint8ToAuxInt(uint8(c)) v.AddArg(x) return true } return false } -func rewriteValueAMD64_OpAMD64VPSLLD256(v *Value) bool { +func rewriteValueAMD64_OpAMD64VPSLLQ256(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] - // match: (VPSLLD256 x (MOVQconst [c])) - // result: (VPSLLD256const [uint8(c)] x) + // match: (VPSLLQ256 x (MOVQconst [c])) + // result: (VPSLLQ256const [uint8(c)] x) for { x := v_0 if v_1.Op != OpAMD64MOVQconst { break } c := auxIntToInt64(v_1.AuxInt) - v.reset(OpAMD64VPSLLD256const) + v.reset(OpAMD64VPSLLQ256const) v.AuxInt = uint8ToAuxInt(uint8(c)) v.AddArg(x) return true } return false } -func rewriteValueAMD64_OpAMD64VPSLLD512(v *Value) bool { +func rewriteValueAMD64_OpAMD64VPSLLQ512(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] - // match: (VPSLLD512 x (MOVQconst [c])) - // result: (VPSLLD512const [uint8(c)] x) + // match: (VPSLLQ512 x (MOVQconst [c])) + // result: (VPSLLQ512const [uint8(c)] x) for { x := v_0 if v_1.Op != OpAMD64MOVQconst { break } c := auxIntToInt64(v_1.AuxInt) - v.reset(OpAMD64VPSLLD512const) + v.reset(OpAMD64VPSLLQ512const) v.AuxInt = uint8ToAuxInt(uint8(c)) v.AddArg(x) return true } return false } -func rewriteValueAMD64_OpAMD64VPSLLQ128(v *Value) bool { +func rewriteValueAMD64_OpAMD64VPSLLQMasked128(v *Value) bool { + v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] - // match: (VPSLLQ128 x (MOVQconst [c])) - // result: (VPSLLQ128const [uint8(c)] x) + // match: (VPSLLQMasked128 x (MOVQconst [c]) mask) + // result: (VPSLLQMasked128const [uint8(c)] x mask) for { x := v_0 if v_1.Op != OpAMD64MOVQconst { break } c := auxIntToInt64(v_1.AuxInt) - v.reset(OpAMD64VPSLLQ128const) + mask := v_2 + v.reset(OpAMD64VPSLLQMasked128const) v.AuxInt = uint8ToAuxInt(uint8(c)) - v.AddArg(x) + v.AddArg2(x, mask) return true } return false } -func rewriteValueAMD64_OpAMD64VPSLLQ256(v *Value) bool { +func rewriteValueAMD64_OpAMD64VPSLLQMasked256(v *Value) bool { + v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] - // match: (VPSLLQ256 x (MOVQconst [c])) - // result: (VPSLLQ256const [uint8(c)] x) + // match: (VPSLLQMasked256 x (MOVQconst 
[c]) mask) + // result: (VPSLLQMasked256const [uint8(c)] x mask) for { x := v_0 if v_1.Op != OpAMD64MOVQconst { break } c := auxIntToInt64(v_1.AuxInt) - v.reset(OpAMD64VPSLLQ256const) + mask := v_2 + v.reset(OpAMD64VPSLLQMasked256const) v.AuxInt = uint8ToAuxInt(uint8(c)) - v.AddArg(x) + v.AddArg2(x, mask) return true } return false } -func rewriteValueAMD64_OpAMD64VPSLLQ512(v *Value) bool { +func rewriteValueAMD64_OpAMD64VPSLLQMasked512(v *Value) bool { + v_2 := v.Args[2] v_1 := v.Args[1] v_0 := v.Args[0] - // match: (VPSLLQ512 x (MOVQconst [c])) - // result: (VPSLLQ512const [uint8(c)] x) + // match: (VPSLLQMasked512 x (MOVQconst [c]) mask) + // result: (VPSLLQMasked512const [uint8(c)] x mask) for { x := v_0 if v_1.Op != OpAMD64MOVQconst { break } c := auxIntToInt64(v_1.AuxInt) - v.reset(OpAMD64VPSLLQ512const) + mask := v_2 + v.reset(OpAMD64VPSLLQMasked512const) v.AuxInt = uint8ToAuxInt(uint8(c)) - v.AddArg(x) + v.AddArg2(x, mask) return true } return false @@ -26366,6 +28700,66 @@ func rewriteValueAMD64_OpAMD64VPSLLW512(v *Value) bool { } return false } +func rewriteValueAMD64_OpAMD64VPSLLWMasked128(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPSLLWMasked128 x (MOVQconst [c]) mask) + // result: (VPSLLWMasked128const [uint8(c)] x mask) + for { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + mask := v_2 + v.reset(OpAMD64VPSLLWMasked128const) + v.AuxInt = uint8ToAuxInt(uint8(c)) + v.AddArg2(x, mask) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPSLLWMasked256(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPSLLWMasked256 x (MOVQconst [c]) mask) + // result: (VPSLLWMasked256const [uint8(c)] x mask) + for { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + mask := v_2 + v.reset(OpAMD64VPSLLWMasked256const) + v.AuxInt = uint8ToAuxInt(uint8(c)) + v.AddArg2(x, mask) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPSLLWMasked512(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPSLLWMasked512 x (MOVQconst [c]) mask) + // result: (VPSLLWMasked512const [uint8(c)] x mask) + for { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + mask := v_2 + v.reset(OpAMD64VPSLLWMasked512const) + v.AuxInt = uint8ToAuxInt(uint8(c)) + v.AddArg2(x, mask) + return true + } + return false +} func rewriteValueAMD64_OpAMD64VPSRAD128(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -26420,6 +28814,66 @@ func rewriteValueAMD64_OpAMD64VPSRAD512(v *Value) bool { } return false } +func rewriteValueAMD64_OpAMD64VPSRADMasked128(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPSRADMasked128 x (MOVQconst [c]) mask) + // result: (VPSRADMasked128const [uint8(c)] x mask) + for { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + mask := v_2 + v.reset(OpAMD64VPSRADMasked128const) + v.AuxInt = uint8ToAuxInt(uint8(c)) + v.AddArg2(x, mask) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPSRADMasked256(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPSRADMasked256 x (MOVQconst [c]) mask) + // result: (VPSRADMasked256const [uint8(c)] x mask) + for { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + mask := v_2 + v.reset(OpAMD64VPSRADMasked256const) + v.AuxInt = 
uint8ToAuxInt(uint8(c)) + v.AddArg2(x, mask) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPSRADMasked512(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPSRADMasked512 x (MOVQconst [c]) mask) + // result: (VPSRADMasked512const [uint8(c)] x mask) + for { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + mask := v_2 + v.reset(OpAMD64VPSRADMasked512const) + v.AuxInt = uint8ToAuxInt(uint8(c)) + v.AddArg2(x, mask) + return true + } + return false +} func rewriteValueAMD64_OpAMD64VPSRAQ128(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -26474,6 +28928,66 @@ func rewriteValueAMD64_OpAMD64VPSRAQ512(v *Value) bool { } return false } +func rewriteValueAMD64_OpAMD64VPSRAQMasked128(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPSRAQMasked128 x (MOVQconst [c]) mask) + // result: (VPSRAQMasked128const [uint8(c)] x mask) + for { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + mask := v_2 + v.reset(OpAMD64VPSRAQMasked128const) + v.AuxInt = uint8ToAuxInt(uint8(c)) + v.AddArg2(x, mask) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPSRAQMasked256(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPSRAQMasked256 x (MOVQconst [c]) mask) + // result: (VPSRAQMasked256const [uint8(c)] x mask) + for { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + mask := v_2 + v.reset(OpAMD64VPSRAQMasked256const) + v.AuxInt = uint8ToAuxInt(uint8(c)) + v.AddArg2(x, mask) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPSRAQMasked512(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPSRAQMasked512 x (MOVQconst [c]) mask) + // result: (VPSRAQMasked512const [uint8(c)] x mask) + for { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + mask := v_2 + v.reset(OpAMD64VPSRAQMasked512const) + v.AuxInt = uint8ToAuxInt(uint8(c)) + v.AddArg2(x, mask) + return true + } + return false +} func rewriteValueAMD64_OpAMD64VPSRAW128(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -26528,6 +29042,66 @@ func rewriteValueAMD64_OpAMD64VPSRAW512(v *Value) bool { } return false } +func rewriteValueAMD64_OpAMD64VPSRAWMasked128(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPSRAWMasked128 x (MOVQconst [c]) mask) + // result: (VPSRAWMasked128const [uint8(c)] x mask) + for { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + mask := v_2 + v.reset(OpAMD64VPSRAWMasked128const) + v.AuxInt = uint8ToAuxInt(uint8(c)) + v.AddArg2(x, mask) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPSRAWMasked256(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPSRAWMasked256 x (MOVQconst [c]) mask) + // result: (VPSRAWMasked256const [uint8(c)] x mask) + for { + x := v_0 + if v_1.Op != OpAMD64MOVQconst { + break + } + c := auxIntToInt64(v_1.AuxInt) + mask := v_2 + v.reset(OpAMD64VPSRAWMasked256const) + v.AuxInt = uint8ToAuxInt(uint8(c)) + v.AddArg2(x, mask) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPSRAWMasked512(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPSRAWMasked512 x (MOVQconst [c]) mask) + // result: (VPSRAWMasked512const [uint8(c)] x mask) + for { + x := v_0 + if 
v_1.Op != OpAMD64MOVQconst {
+			break
+		}
+		c := auxIntToInt64(v_1.AuxInt)
+		mask := v_2
+		v.reset(OpAMD64VPSRAWMasked512const)
+		v.AuxInt = uint8ToAuxInt(uint8(c))
+		v.AddArg2(x, mask)
+		return true
+	}
+	return false
+}
 func rewriteValueAMD64_OpAMD64XADDLlock(v *Value) bool {
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
diff --git a/src/simd/_gen/simdgen/gen_simdrules.go b/src/simd/_gen/simdgen/gen_simdrules.go
index b0fc7e62cd..8c31411113 100644
--- a/src/simd/_gen/simdgen/gen_simdrules.go
+++ b/src/simd/_gen/simdgen/gen_simdrules.go
@@ -8,6 +8,7 @@ import (
 	"bytes"
 	"fmt"
 	"slices"
+	"strings"
 	"text/template"
 )
 
@@ -20,6 +21,7 @@ type tplRuleData struct {
 	ArgsOut        string // e.g. "x y"
 	MaskInConvert  string // e.g. "VPMOVVec32x8ToM"
 	MaskOutConvert string // e.g. "VPMOVMToVec32x8"
+	ElementSize    int    // e.g. 32
 }
 
 var (
@@ -39,6 +41,42 @@
 `))
 )
 
+func (d tplRuleData) MaskOptimization() string {
+	asmNoMask := d.Asm
+	if !strings.Contains(asmNoMask, "Masked") {
+		return ""
+	}
+	asmNoMask = strings.ReplaceAll(asmNoMask, "Masked", "")
+
+	for _, nope := range []string{"VMOVDQU", "VPCOMPRESS", "VCOMPRESS", "VPEXPAND", "VEXPAND", "VPBLENDM", "VMOVUP"} {
+		if strings.HasPrefix(asmNoMask, nope) {
+			return ""
+		}
+	}
+
+	size := asmNoMask[len(asmNoMask)-3:]
+	if strings.HasSuffix(asmNoMask, "const") {
+		sufLen := len("128const")
+		size = asmNoMask[len(asmNoMask)-sufLen:][:3]
+	}
+	switch size {
+	case "128", "256":
+		// TODO: 128- and 256-bit forms are not handled yet because they will require a feature guard check in the rewrite.
+		return ""
+	case "512":
+	default:
+		panic("Unexpected operation size on " + d.Asm)
+	}
+
+	switch d.ElementSize {
+	case 8, 16, 32, 64:
+	default:
+		panic(fmt.Errorf("Unexpected operation width %d on %v", d.ElementSize, d.Asm))
+	}
+
+	return fmt.Sprintf("(VMOVDQU%dMasked512 (%s %s) mask) => (%s %s mask)\n", d.ElementSize, asmNoMask, d.Args, d.Asm, d.Args)
+}
+
 // SSA rewrite rules need to appear in a most-to-least-specific order. This works for that.
 var tmplOrder = map[string]int{
 	"masksftimm": 0,
@@ -80,11 +118,9 @@ func writeSIMDRules(ops []Operation) *bytes.Buffer {
 	buffer.WriteString(generatedHeader + "\n")
 
 	var allData []tplRuleData
+	var optData []tplRuleData // for peephole optimizations
 
 	for _, opr := range ops {
-		if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" {
-			continue
-		}
 		opInShape, opOutShape, maskType, immType, gOp := opr.shape()
 		asm := machineOpName(maskType, gOp)
 		vregInCnt := len(gOp.In)
@@ -146,7 +182,9 @@
 		data.GoType = goType(gOp)
 
 		rearIdx := len(gOp.In) - 1 // Mask is at the end.
-		data.MaskInConvert = fmt.Sprintf("VPMOVVec%dx%dToM", *gOp.In[rearIdx].ElemBits, *gOp.In[rearIdx].Lanes)
+		width := *gOp.In[rearIdx].ElemBits
+		data.MaskInConvert = fmt.Sprintf("VPMOVVec%dx%dToM", width, *gOp.In[rearIdx].Lanes)
+		data.ElementSize = width
 	case PureKmaskIn:
 		panic(fmt.Errorf("simdgen does not support pure k mask instructions, they should be generated by compiler optimizations"))
 	}
@@ -196,6 +234,10 @@
 			data.ArgsOut = "..."
 		}
 		data.tplName = tplName
+		if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" {
+			optData = append(optData, data)
+			continue
+		}
 		allData = append(allData, data)
 	}
 
@@ -207,5 +249,18 @@
 		}
 	}
 
+	seen := make(map[string]bool)
+
+	for _, data := range optData {
+		if data.tplName == "maskIn" {
+			rule := data.MaskOptimization()
+			if seen[rule] {
+				continue
+			}
+			seen[rule] = true
+			buffer.WriteString(rule)
+		}
+	}
+
 	return buffer
 }
diff --git a/src/simd/simd_test.go b/src/simd/simd_test.go
index 8f6142203e..38065cb841 100644
--- a/src/simd/simd_test.go
+++ b/src/simd/simd_test.go
@@ -445,3 +445,36 @@ func TestBroadcastFloat32x8(t *testing.T) {
 	simd.BroadcastFloat32x8(123456789).StoreSlice(s)
 	checkSlices(t, s, []float32{123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789})
 }
+
+func TestBroadcastFloat64x2(t *testing.T) {
+	s := make([]float64, 2)
+	simd.BroadcastFloat64x2(123456789).StoreSlice(s)
+	checkSlices(t, s, []float64{123456789, 123456789})
+}
+
+func TestBroadcastUint64x2(t *testing.T) {
+	s := make([]uint64, 2)
+	simd.BroadcastUint64x2(123456789).StoreSlice(s)
+	checkSlices(t, s, []uint64{123456789, 123456789})
+}
+
+func TestMaskOpt512(t *testing.T) {
+	if !simd.HasAVX512() {
+		t.Skip("Test requires HasAVX512, not available on this hardware")
+		return
+	}
+
+	k := make([]int64, 8)
+	s := make([]float64, 8)
+
+	a := simd.LoadFloat64x8Slice([]float64{2, 0, 2, 0, 2, 0, 2, 0})
+	b := simd.LoadFloat64x8Slice([]float64{1, 1, 1, 1, 1, 1, 1, 1})
+	c := simd.LoadFloat64x8Slice([]float64{1, 2, 3, 4, 5, 6, 7, 8})
+	d := simd.LoadFloat64x8Slice([]float64{2, 4, 6, 8, 10, 12, 14, 16})
+	g := a.Greater(b)
+	e := c.Add(d).Masked(g)
+	e.StoreSlice(s)
+	g.AsInt64x8().StoreSlice(k)
+	checkSlices[int64](t, k, []int64{-1, 0, -1, 0, -1, 0, -1, 0})
+	checkSlices[float64](t, s, []float64{3, 0, 9, 0, 15, 0, 21, 0})
+}
-- 
2.52.0
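
Note for readers tracing the generator change: the shape of the rule that MaskOptimization emits can be sketched in isolation. The maskRule helper below is a hypothetical, simplified stand-in for that method (it omits the opcode exclusion list and the 512-bit/element-size guards) and is not part of the patch itself:

package main

import (
	"fmt"
	"strings"
)

// maskRule sketches the rewrite rule that MaskOptimization generates for a
// 512-bit masked op: dropping "Masked" from the opcode name recovers the
// unmasked form, and the emitted rule folds a masked move
// (VMOVDQU<elemSize>Masked512) of the unmasked result into the masked
// instruction itself.
func maskRule(maskedAsm, args string, elemSize int) string {
	unmasked := strings.ReplaceAll(maskedAsm, "Masked", "")
	return fmt.Sprintf("(VMOVDQU%dMasked512 (%s %s) mask) => (%s %s mask)",
		elemSize, unmasked, args, maskedAsm, args)
}

func main() {
	// Prints: (VMOVDQU64Masked512 (VSQRTPD512 x) mask) => (VSQRTPDMasked512 x mask)
	// which is the match/result pair visible in the generated rewrite function
	// for VSQRTPD512 above.
	fmt.Println(maskRule("VSQRTPDMasked512", "x", 64))
}

This is also the pattern TestMaskOpt512 exercises from the user side: c.Add(d).Masked(g) first compiles to an unmasked vector add followed by a masked move, which the generated rules fold into a single masked add; the stored lanes confirm that the mask still applies.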