Cypherpunks repositories - gostls13.git/commitdiff
[dev.simd] cmd/compile, simd/_gen: add rewrite for const load ops
author Junyang Shao <shaojunyang@google.com>
Wed, 17 Sep 2025 14:44:49 +0000 (14:44 +0000)
committer Junyang Shao <shaojunyang@google.com>
Thu, 18 Sep 2025 18:06:52 +0000 (11:06 -0700)
This CL adds rewrite rules that fold a load operand into ops carrying a
const imm8, rewriting them to their memory forms.

Change-Id: I74d0df48715ab48b88b04c8e1bfb3c6b8e528aeb
Reviewed-on: https://go-review.googlesource.com/c/go/+/704635
TryBot-Bypass: Junyang Shao <shaojunyang@google.com>
Reviewed-by: David Chase <drchase@google.com>
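
Context for the rules below: the merged load op has to carry both the op's 8-bit immediate and the load's 32-bit offset in a single AuxInt, which is what the makeValAndOff(int32(int8(c)),off) expression in every result does. The following is a minimal standalone sketch of that packing, for illustration only; the type and helpers here are local stand-ins written under the assumption that the value sits in the high 32 bits and the offset in the low 32 bits, not the compiler's own definitions.

package main

import "fmt"

// valAndOff packs a 32-bit value (here the sign-extended imm8) into the
// high 32 bits and a 32-bit load offset into the low 32 bits of an int64.
type valAndOff int64

func makeValAndOff(val, off int32) valAndOff {
	return valAndOff(int64(val)<<32 | int64(uint32(off)))
}

func (x valAndOff) val() int32 { return int32(int64(x) >> 32) }
func (x valAndOff) off() int32 { return int32(int64(x)) }

func main() {
	c := uint8(0x0b) // hypothetical imm8 taken from the op's AuxInt
	off := int32(64) // hypothetical offset from the folded VMOVDQU load
	vo := makeValAndOff(int32(int8(c)), off)
	fmt.Println(vo.val(), vo.off()) // prints: 11 64
}

The int32(int8(c)) conversion mirrors the rules: the unsigned imm8 is first reinterpreted as a signed byte and then sign-extended, so both halves of the packed aux value round-trip exactly.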
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
src/cmd/compile/internal/ssa/rewriteAMD64.go
src/simd/_gen/simdgen/gen_simdrules.go

index 65f47eb369bcc001223a2366da26e4ec15558f48..b6a7394a73aa8ad5d6142b7ccd14178a5f208abb 100644 (file)
 (VPANDNQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPANDNQMasked128load {sym} [off] x ptr mask mem)
 (VPANDNQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPANDNQMasked256load {sym} [off] x ptr mask mem)
 (VPANDNQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPANDNQMasked512load {sym} [off] x ptr mask mem)
+(VRNDSCALEPS128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPS128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VRNDSCALEPS256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPS256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VRNDSCALEPS512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPS512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VRNDSCALEPD128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPD128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VRNDSCALEPD256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPD256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VRNDSCALEPD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPD512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VRNDSCALEPSMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPSMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VRNDSCALEPSMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPSMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VRNDSCALEPSMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPSMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VRNDSCALEPDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VRNDSCALEPDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VRNDSCALEPDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRNDSCALEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VREDUCEPS128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPS128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VREDUCEPS256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPS256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VREDUCEPS512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPS512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VREDUCEPD128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPD128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VREDUCEPD256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPD256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VREDUCEPD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPD512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VREDUCEPSMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPSMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VREDUCEPSMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPSMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VREDUCEPSMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPSMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VREDUCEPDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VREDUCEPDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VREDUCEPDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
 (VPACKSSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDW512load {sym} [off] x ptr mem)
 (VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem)
 (VPACKSSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked256load {sym} [off] x ptr mask mem)
 (VDIVPDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VDIVPDMasked512load {sym} [off] x ptr mask mem)
 (VPCMPEQD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPEQD512load {sym} [off] x ptr mem)
 (VPCMPEQQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPEQQ512load {sym} [off] x ptr mem)
+(VCMPPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VCMPPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+(VCMPPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VCMPPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+(VCMPPSMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPSMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VCMPPSMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPSMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VCMPPSMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPSMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VCMPPDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VCMPPDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VCMPPDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VCMPPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VPCMPDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VPCMPDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VPCMPDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VPCMPQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VPCMPQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VPCMPQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VPCMPUDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VPCMPUDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VPCMPUDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VPCMPUQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VPCMPUQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VPCMPUQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+(VGF2P8AFFINEQB128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQB128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VGF2P8AFFINEQB256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQB256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VGF2P8AFFINEQB512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQB512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VGF2P8AFFINEINVQB128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQB128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VGF2P8AFFINEINVQB256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQB256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VGF2P8AFFINEINVQB512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQB512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VGF2P8AFFINEINVQBMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQBMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VGF2P8AFFINEINVQBMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQBMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VGF2P8AFFINEINVQBMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEINVQBMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VGF2P8AFFINEQBMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQBMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VGF2P8AFFINEQBMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQBMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VGF2P8AFFINEQBMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VGF2P8AFFINEQBMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
 (VPCMPGTD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPGTD512load {sym} [off] x ptr mem)
 (VPCMPGTQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPGTQ512load {sym} [off] x ptr mem)
+(VPCMPUD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPUD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+(VPCMPUQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPUQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+(VPCMPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+(VPCMPQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPCMPQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
 (VPUNPCKHDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPUNPCKHDQ512load {sym} [off] x ptr mem)
 (VPUNPCKHQDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPUNPCKHQDQ512load {sym} [off] x ptr mem)
 (VPUNPCKLDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPUNPCKLDQ512load {sym} [off] x ptr mem)
 (VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem)
 (VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem)
 (VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem)
+(VPSHUFD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPSHUFDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSHUFDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSHUFDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
 (VPERMPSMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked256load {sym} [off] x ptr mask mem)
 (VPERMDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMDMasked256load {sym} [off] x ptr mask mem)
 (VPERMPSMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked512load {sym} [off] x ptr mask mem)
 (VRSQRT14PDMasked128 l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRSQRT14PDMasked128load {sym} [off] ptr mask mem)
 (VRSQRT14PDMasked256 l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRSQRT14PDMasked256load {sym} [off] ptr mask mem)
 (VRSQRT14PDMasked512 l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VRSQRT14PDMasked512load {sym} [off] ptr mask mem)
+(VPROLD128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLD128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPROLD256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLD256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPROLD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLD512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPROLQ128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLQ128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPROLQ256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLQ256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPROLQ512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLQ512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPROLDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPROLDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPROLDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPROLQMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPROLQMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPROLQMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPROLQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPRORD128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORD128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPRORD256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORD256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPRORD512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORD512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPRORQ128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORQ128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPRORQ256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORQ256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPRORQ512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPRORQ512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPRORDMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPRORDMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPRORDMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPRORQMasked128 [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPRORQMasked256 [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPRORQMasked512 [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPRORQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
 (VPROLVD128 x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLVD128load {sym} [off] x ptr mem)
 (VPROLVD256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLVD256load {sym} [off] x ptr mem)
 (VPROLVD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPROLVD512load {sym} [off] x ptr mem)
 (VSCALEFPDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPDMasked128load {sym} [off] x ptr mask mem)
 (VSCALEFPDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPDMasked256load {sym} [off] x ptr mask mem)
 (VSCALEFPDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VSCALEFPDMasked512load {sym} [off] x ptr mask mem)
+(VPSHLDD128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VPSHLDD256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VPSHLDD512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDD512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VPSHLDQ128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VPSHLDQ256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VPSHLDQ512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQ512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VPSHLDDMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VPSHLDDMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VPSHLDDMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VPSHLDQMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VPSHLDQMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VPSHLDQMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHLDQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VPSHRDD128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VPSHRDD256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VPSHRDD512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDD512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VPSHRDQ128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VPSHRDQ256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VPSHRDQ512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQ512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mem)
+(VPSHRDDMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VPSHRDDMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VPSHRDDMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VPSHRDQMasked128 [c]  x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VPSHRDQMasked256 [c]  x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
+(VPSHRDQMasked512 [c]  x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHRDQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)]  x ptr mask mem)
 (VPSLLVD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLVD512load {sym} [off] x ptr mem)
 (VPSLLVQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLVQ512load {sym} [off] x ptr mem)
 (VPSHLDVD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHLDVD128load {sym} [off] x y ptr mem)
 (VPXORQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPXORQMasked512load {sym} [off] x ptr mask mem)
 (VPBLENDMDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMDMasked512load {sym} [off] x ptr mask mem)
 (VPBLENDMQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMQMasked512load {sym} [off] x ptr mask mem)
+(VPSLLD512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPSLLQ512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPSLLDMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSLLDMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSLLDMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSLLQMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLQMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSLLQMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLQMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSLLQMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLQMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSRLD512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRLD512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPSRLQ512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPSRAD512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAD512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPSRAQ128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAQ128constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPSRAQ256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAQ256constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPSRAQ512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSRAQ512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mem)
+(VPSRLDMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSRLDMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLDMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSRLDMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLDMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSRLQMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLQMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSRLQMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLQMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSRLQMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRLQMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSRADMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRADMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSRADMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRADMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSRADMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRADMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSRAQMasked128const [c]  l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSRAQMasked256const [c]  l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
+(VPSRAQMasked512const [c]  l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)]  ptr mask mem)
index 471fa0c201e7ff0fc3662753ccd309cea8cc302a..c0f5b4086a61a66c05491e8054efb589a4bf43f3 100644 (file)
@@ -523,6 +523,22 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpAMD64VADDPSMasked256(v)
        case OpAMD64VADDPSMasked512:
                return rewriteValueAMD64_OpAMD64VADDPSMasked512(v)
+       case OpAMD64VCMPPD512:
+               return rewriteValueAMD64_OpAMD64VCMPPD512(v)
+       case OpAMD64VCMPPDMasked128:
+               return rewriteValueAMD64_OpAMD64VCMPPDMasked128(v)
+       case OpAMD64VCMPPDMasked256:
+               return rewriteValueAMD64_OpAMD64VCMPPDMasked256(v)
+       case OpAMD64VCMPPDMasked512:
+               return rewriteValueAMD64_OpAMD64VCMPPDMasked512(v)
+       case OpAMD64VCMPPS512:
+               return rewriteValueAMD64_OpAMD64VCMPPS512(v)
+       case OpAMD64VCMPPSMasked128:
+               return rewriteValueAMD64_OpAMD64VCMPPSMasked128(v)
+       case OpAMD64VCMPPSMasked256:
+               return rewriteValueAMD64_OpAMD64VCMPPSMasked256(v)
+       case OpAMD64VCMPPSMasked512:
+               return rewriteValueAMD64_OpAMD64VCMPPSMasked512(v)
        case OpAMD64VCVTPS2UDQ128:
                return rewriteValueAMD64_OpAMD64VCVTPS2UDQ128(v)
        case OpAMD64VCVTPS2UDQ256:
@@ -631,6 +647,30 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpAMD64VFMSUBADD213PSMasked256(v)
        case OpAMD64VFMSUBADD213PSMasked512:
                return rewriteValueAMD64_OpAMD64VFMSUBADD213PSMasked512(v)
+       case OpAMD64VGF2P8AFFINEINVQB128:
+               return rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQB128(v)
+       case OpAMD64VGF2P8AFFINEINVQB256:
+               return rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQB256(v)
+       case OpAMD64VGF2P8AFFINEINVQB512:
+               return rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQB512(v)
+       case OpAMD64VGF2P8AFFINEINVQBMasked128:
+               return rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQBMasked128(v)
+       case OpAMD64VGF2P8AFFINEINVQBMasked256:
+               return rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQBMasked256(v)
+       case OpAMD64VGF2P8AFFINEINVQBMasked512:
+               return rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQBMasked512(v)
+       case OpAMD64VGF2P8AFFINEQB128:
+               return rewriteValueAMD64_OpAMD64VGF2P8AFFINEQB128(v)
+       case OpAMD64VGF2P8AFFINEQB256:
+               return rewriteValueAMD64_OpAMD64VGF2P8AFFINEQB256(v)
+       case OpAMD64VGF2P8AFFINEQB512:
+               return rewriteValueAMD64_OpAMD64VGF2P8AFFINEQB512(v)
+       case OpAMD64VGF2P8AFFINEQBMasked128:
+               return rewriteValueAMD64_OpAMD64VGF2P8AFFINEQBMasked128(v)
+       case OpAMD64VGF2P8AFFINEQBMasked256:
+               return rewriteValueAMD64_OpAMD64VGF2P8AFFINEQBMasked256(v)
+       case OpAMD64VGF2P8AFFINEQBMasked512:
+               return rewriteValueAMD64_OpAMD64VGF2P8AFFINEQBMasked512(v)
        case OpAMD64VMAXPD512:
                return rewriteValueAMD64_OpAMD64VMAXPD512(v)
        case OpAMD64VMAXPDMasked128:
@@ -807,6 +847,14 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpAMD64VPBROADCASTW256(v)
        case OpAMD64VPBROADCASTW512:
                return rewriteValueAMD64_OpAMD64VPBROADCASTW512(v)
+       case OpAMD64VPCMPD512:
+               return rewriteValueAMD64_OpAMD64VPCMPD512(v)
+       case OpAMD64VPCMPDMasked128:
+               return rewriteValueAMD64_OpAMD64VPCMPDMasked128(v)
+       case OpAMD64VPCMPDMasked256:
+               return rewriteValueAMD64_OpAMD64VPCMPDMasked256(v)
+       case OpAMD64VPCMPDMasked512:
+               return rewriteValueAMD64_OpAMD64VPCMPDMasked512(v)
        case OpAMD64VPCMPEQD512:
                return rewriteValueAMD64_OpAMD64VPCMPEQD512(v)
        case OpAMD64VPCMPEQQ512:
@@ -815,6 +863,30 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpAMD64VPCMPGTD512(v)
        case OpAMD64VPCMPGTQ512:
                return rewriteValueAMD64_OpAMD64VPCMPGTQ512(v)
+       case OpAMD64VPCMPQ512:
+               return rewriteValueAMD64_OpAMD64VPCMPQ512(v)
+       case OpAMD64VPCMPQMasked128:
+               return rewriteValueAMD64_OpAMD64VPCMPQMasked128(v)
+       case OpAMD64VPCMPQMasked256:
+               return rewriteValueAMD64_OpAMD64VPCMPQMasked256(v)
+       case OpAMD64VPCMPQMasked512:
+               return rewriteValueAMD64_OpAMD64VPCMPQMasked512(v)
+       case OpAMD64VPCMPUD512:
+               return rewriteValueAMD64_OpAMD64VPCMPUD512(v)
+       case OpAMD64VPCMPUDMasked128:
+               return rewriteValueAMD64_OpAMD64VPCMPUDMasked128(v)
+       case OpAMD64VPCMPUDMasked256:
+               return rewriteValueAMD64_OpAMD64VPCMPUDMasked256(v)
+       case OpAMD64VPCMPUDMasked512:
+               return rewriteValueAMD64_OpAMD64VPCMPUDMasked512(v)
+       case OpAMD64VPCMPUQ512:
+               return rewriteValueAMD64_OpAMD64VPCMPUQ512(v)
+       case OpAMD64VPCMPUQMasked128:
+               return rewriteValueAMD64_OpAMD64VPCMPUQMasked128(v)
+       case OpAMD64VPCMPUQMasked256:
+               return rewriteValueAMD64_OpAMD64VPCMPUQMasked256(v)
+       case OpAMD64VPCMPUQMasked512:
+               return rewriteValueAMD64_OpAMD64VPCMPUQMasked512(v)
        case OpAMD64VPDPBUSD512:
                return rewriteValueAMD64_OpAMD64VPDPBUSD512(v)
        case OpAMD64VPDPBUSDMasked128:
@@ -1115,6 +1187,30 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpAMD64VPORQMasked256(v)
        case OpAMD64VPORQMasked512:
                return rewriteValueAMD64_OpAMD64VPORQMasked512(v)
+       case OpAMD64VPROLD128:
+               return rewriteValueAMD64_OpAMD64VPROLD128(v)
+       case OpAMD64VPROLD256:
+               return rewriteValueAMD64_OpAMD64VPROLD256(v)
+       case OpAMD64VPROLD512:
+               return rewriteValueAMD64_OpAMD64VPROLD512(v)
+       case OpAMD64VPROLDMasked128:
+               return rewriteValueAMD64_OpAMD64VPROLDMasked128(v)
+       case OpAMD64VPROLDMasked256:
+               return rewriteValueAMD64_OpAMD64VPROLDMasked256(v)
+       case OpAMD64VPROLDMasked512:
+               return rewriteValueAMD64_OpAMD64VPROLDMasked512(v)
+       case OpAMD64VPROLQ128:
+               return rewriteValueAMD64_OpAMD64VPROLQ128(v)
+       case OpAMD64VPROLQ256:
+               return rewriteValueAMD64_OpAMD64VPROLQ256(v)
+       case OpAMD64VPROLQ512:
+               return rewriteValueAMD64_OpAMD64VPROLQ512(v)
+       case OpAMD64VPROLQMasked128:
+               return rewriteValueAMD64_OpAMD64VPROLQMasked128(v)
+       case OpAMD64VPROLQMasked256:
+               return rewriteValueAMD64_OpAMD64VPROLQMasked256(v)
+       case OpAMD64VPROLQMasked512:
+               return rewriteValueAMD64_OpAMD64VPROLQMasked512(v)
        case OpAMD64VPROLVD128:
                return rewriteValueAMD64_OpAMD64VPROLVD128(v)
        case OpAMD64VPROLVD256:
@@ -1139,6 +1235,30 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpAMD64VPROLVQMasked256(v)
        case OpAMD64VPROLVQMasked512:
                return rewriteValueAMD64_OpAMD64VPROLVQMasked512(v)
+       case OpAMD64VPRORD128:
+               return rewriteValueAMD64_OpAMD64VPRORD128(v)
+       case OpAMD64VPRORD256:
+               return rewriteValueAMD64_OpAMD64VPRORD256(v)
+       case OpAMD64VPRORD512:
+               return rewriteValueAMD64_OpAMD64VPRORD512(v)
+       case OpAMD64VPRORDMasked128:
+               return rewriteValueAMD64_OpAMD64VPRORDMasked128(v)
+       case OpAMD64VPRORDMasked256:
+               return rewriteValueAMD64_OpAMD64VPRORDMasked256(v)
+       case OpAMD64VPRORDMasked512:
+               return rewriteValueAMD64_OpAMD64VPRORDMasked512(v)
+       case OpAMD64VPRORQ128:
+               return rewriteValueAMD64_OpAMD64VPRORQ128(v)
+       case OpAMD64VPRORQ256:
+               return rewriteValueAMD64_OpAMD64VPRORQ256(v)
+       case OpAMD64VPRORQ512:
+               return rewriteValueAMD64_OpAMD64VPRORQ512(v)
+       case OpAMD64VPRORQMasked128:
+               return rewriteValueAMD64_OpAMD64VPRORQMasked128(v)
+       case OpAMD64VPRORQMasked256:
+               return rewriteValueAMD64_OpAMD64VPRORQMasked256(v)
+       case OpAMD64VPRORQMasked512:
+               return rewriteValueAMD64_OpAMD64VPRORQMasked512(v)
        case OpAMD64VPRORVD128:
                return rewriteValueAMD64_OpAMD64VPRORVD128(v)
        case OpAMD64VPRORVD256:
@@ -1163,6 +1283,30 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpAMD64VPRORVQMasked256(v)
        case OpAMD64VPRORVQMasked512:
                return rewriteValueAMD64_OpAMD64VPRORVQMasked512(v)
+       case OpAMD64VPSHLDD128:
+               return rewriteValueAMD64_OpAMD64VPSHLDD128(v)
+       case OpAMD64VPSHLDD256:
+               return rewriteValueAMD64_OpAMD64VPSHLDD256(v)
+       case OpAMD64VPSHLDD512:
+               return rewriteValueAMD64_OpAMD64VPSHLDD512(v)
+       case OpAMD64VPSHLDDMasked128:
+               return rewriteValueAMD64_OpAMD64VPSHLDDMasked128(v)
+       case OpAMD64VPSHLDDMasked256:
+               return rewriteValueAMD64_OpAMD64VPSHLDDMasked256(v)
+       case OpAMD64VPSHLDDMasked512:
+               return rewriteValueAMD64_OpAMD64VPSHLDDMasked512(v)
+       case OpAMD64VPSHLDQ128:
+               return rewriteValueAMD64_OpAMD64VPSHLDQ128(v)
+       case OpAMD64VPSHLDQ256:
+               return rewriteValueAMD64_OpAMD64VPSHLDQ256(v)
+       case OpAMD64VPSHLDQ512:
+               return rewriteValueAMD64_OpAMD64VPSHLDQ512(v)
+       case OpAMD64VPSHLDQMasked128:
+               return rewriteValueAMD64_OpAMD64VPSHLDQMasked128(v)
+       case OpAMD64VPSHLDQMasked256:
+               return rewriteValueAMD64_OpAMD64VPSHLDQMasked256(v)
+       case OpAMD64VPSHLDQMasked512:
+               return rewriteValueAMD64_OpAMD64VPSHLDQMasked512(v)
        case OpAMD64VPSHLDVD128:
                return rewriteValueAMD64_OpAMD64VPSHLDVD128(v)
        case OpAMD64VPSHLDVD256:
@@ -1187,6 +1331,30 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpAMD64VPSHLDVQMasked256(v)
        case OpAMD64VPSHLDVQMasked512:
                return rewriteValueAMD64_OpAMD64VPSHLDVQMasked512(v)
+       case OpAMD64VPSHRDD128:
+               return rewriteValueAMD64_OpAMD64VPSHRDD128(v)
+       case OpAMD64VPSHRDD256:
+               return rewriteValueAMD64_OpAMD64VPSHRDD256(v)
+       case OpAMD64VPSHRDD512:
+               return rewriteValueAMD64_OpAMD64VPSHRDD512(v)
+       case OpAMD64VPSHRDDMasked128:
+               return rewriteValueAMD64_OpAMD64VPSHRDDMasked128(v)
+       case OpAMD64VPSHRDDMasked256:
+               return rewriteValueAMD64_OpAMD64VPSHRDDMasked256(v)
+       case OpAMD64VPSHRDDMasked512:
+               return rewriteValueAMD64_OpAMD64VPSHRDDMasked512(v)
+       case OpAMD64VPSHRDQ128:
+               return rewriteValueAMD64_OpAMD64VPSHRDQ128(v)
+       case OpAMD64VPSHRDQ256:
+               return rewriteValueAMD64_OpAMD64VPSHRDQ256(v)
+       case OpAMD64VPSHRDQ512:
+               return rewriteValueAMD64_OpAMD64VPSHRDQ512(v)
+       case OpAMD64VPSHRDQMasked128:
+               return rewriteValueAMD64_OpAMD64VPSHRDQMasked128(v)
+       case OpAMD64VPSHRDQMasked256:
+               return rewriteValueAMD64_OpAMD64VPSHRDQMasked256(v)
+       case OpAMD64VPSHRDQMasked512:
+               return rewriteValueAMD64_OpAMD64VPSHRDQMasked512(v)
        case OpAMD64VPSHRDVD128:
                return rewriteValueAMD64_OpAMD64VPSHRDVD128(v)
        case OpAMD64VPSHRDVD256:
@@ -1211,30 +1379,54 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpAMD64VPSHRDVQMasked256(v)
        case OpAMD64VPSHRDVQMasked512:
                return rewriteValueAMD64_OpAMD64VPSHRDVQMasked512(v)
+       case OpAMD64VPSHUFD512:
+               return rewriteValueAMD64_OpAMD64VPSHUFD512(v)
+       case OpAMD64VPSHUFDMasked128:
+               return rewriteValueAMD64_OpAMD64VPSHUFDMasked128(v)
+       case OpAMD64VPSHUFDMasked256:
+               return rewriteValueAMD64_OpAMD64VPSHUFDMasked256(v)
+       case OpAMD64VPSHUFDMasked512:
+               return rewriteValueAMD64_OpAMD64VPSHUFDMasked512(v)
        case OpAMD64VPSLLD128:
                return rewriteValueAMD64_OpAMD64VPSLLD128(v)
        case OpAMD64VPSLLD256:
                return rewriteValueAMD64_OpAMD64VPSLLD256(v)
        case OpAMD64VPSLLD512:
                return rewriteValueAMD64_OpAMD64VPSLLD512(v)
+       case OpAMD64VPSLLD512const:
+               return rewriteValueAMD64_OpAMD64VPSLLD512const(v)
        case OpAMD64VPSLLDMasked128:
                return rewriteValueAMD64_OpAMD64VPSLLDMasked128(v)
+       case OpAMD64VPSLLDMasked128const:
+               return rewriteValueAMD64_OpAMD64VPSLLDMasked128const(v)
        case OpAMD64VPSLLDMasked256:
                return rewriteValueAMD64_OpAMD64VPSLLDMasked256(v)
+       case OpAMD64VPSLLDMasked256const:
+               return rewriteValueAMD64_OpAMD64VPSLLDMasked256const(v)
        case OpAMD64VPSLLDMasked512:
                return rewriteValueAMD64_OpAMD64VPSLLDMasked512(v)
+       case OpAMD64VPSLLDMasked512const:
+               return rewriteValueAMD64_OpAMD64VPSLLDMasked512const(v)
        case OpAMD64VPSLLQ128:
                return rewriteValueAMD64_OpAMD64VPSLLQ128(v)
        case OpAMD64VPSLLQ256:
                return rewriteValueAMD64_OpAMD64VPSLLQ256(v)
        case OpAMD64VPSLLQ512:
                return rewriteValueAMD64_OpAMD64VPSLLQ512(v)
+       case OpAMD64VPSLLQ512const:
+               return rewriteValueAMD64_OpAMD64VPSLLQ512const(v)
        case OpAMD64VPSLLQMasked128:
                return rewriteValueAMD64_OpAMD64VPSLLQMasked128(v)
+       case OpAMD64VPSLLQMasked128const:
+               return rewriteValueAMD64_OpAMD64VPSLLQMasked128const(v)
        case OpAMD64VPSLLQMasked256:
                return rewriteValueAMD64_OpAMD64VPSLLQMasked256(v)
+       case OpAMD64VPSLLQMasked256const:
+               return rewriteValueAMD64_OpAMD64VPSLLQMasked256const(v)
        case OpAMD64VPSLLQMasked512:
                return rewriteValueAMD64_OpAMD64VPSLLQMasked512(v)
+       case OpAMD64VPSLLQMasked512const:
+               return rewriteValueAMD64_OpAMD64VPSLLQMasked512const(v)
        case OpAMD64VPSLLVD512:
                return rewriteValueAMD64_OpAMD64VPSLLVD512(v)
        case OpAMD64VPSLLVDMasked128:
@@ -1269,24 +1461,44 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpAMD64VPSRAD256(v)
        case OpAMD64VPSRAD512:
                return rewriteValueAMD64_OpAMD64VPSRAD512(v)
+       case OpAMD64VPSRAD512const:
+               return rewriteValueAMD64_OpAMD64VPSRAD512const(v)
        case OpAMD64VPSRADMasked128:
                return rewriteValueAMD64_OpAMD64VPSRADMasked128(v)
+       case OpAMD64VPSRADMasked128const:
+               return rewriteValueAMD64_OpAMD64VPSRADMasked128const(v)
        case OpAMD64VPSRADMasked256:
                return rewriteValueAMD64_OpAMD64VPSRADMasked256(v)
+       case OpAMD64VPSRADMasked256const:
+               return rewriteValueAMD64_OpAMD64VPSRADMasked256const(v)
        case OpAMD64VPSRADMasked512:
                return rewriteValueAMD64_OpAMD64VPSRADMasked512(v)
+       case OpAMD64VPSRADMasked512const:
+               return rewriteValueAMD64_OpAMD64VPSRADMasked512const(v)
        case OpAMD64VPSRAQ128:
                return rewriteValueAMD64_OpAMD64VPSRAQ128(v)
+       case OpAMD64VPSRAQ128const:
+               return rewriteValueAMD64_OpAMD64VPSRAQ128const(v)
        case OpAMD64VPSRAQ256:
                return rewriteValueAMD64_OpAMD64VPSRAQ256(v)
+       case OpAMD64VPSRAQ256const:
+               return rewriteValueAMD64_OpAMD64VPSRAQ256const(v)
        case OpAMD64VPSRAQ512:
                return rewriteValueAMD64_OpAMD64VPSRAQ512(v)
+       case OpAMD64VPSRAQ512const:
+               return rewriteValueAMD64_OpAMD64VPSRAQ512const(v)
        case OpAMD64VPSRAQMasked128:
                return rewriteValueAMD64_OpAMD64VPSRAQMasked128(v)
+       case OpAMD64VPSRAQMasked128const:
+               return rewriteValueAMD64_OpAMD64VPSRAQMasked128const(v)
        case OpAMD64VPSRAQMasked256:
                return rewriteValueAMD64_OpAMD64VPSRAQMasked256(v)
+       case OpAMD64VPSRAQMasked256const:
+               return rewriteValueAMD64_OpAMD64VPSRAQMasked256const(v)
        case OpAMD64VPSRAQMasked512:
                return rewriteValueAMD64_OpAMD64VPSRAQMasked512(v)
+       case OpAMD64VPSRAQMasked512const:
+               return rewriteValueAMD64_OpAMD64VPSRAQMasked512const(v)
        case OpAMD64VPSRAVD512:
                return rewriteValueAMD64_OpAMD64VPSRAVD512(v)
        case OpAMD64VPSRAVDMasked128:
@@ -1319,6 +1531,22 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpAMD64VPSRAWMasked256(v)
        case OpAMD64VPSRAWMasked512:
                return rewriteValueAMD64_OpAMD64VPSRAWMasked512(v)
+       case OpAMD64VPSRLD512const:
+               return rewriteValueAMD64_OpAMD64VPSRLD512const(v)
+       case OpAMD64VPSRLDMasked128const:
+               return rewriteValueAMD64_OpAMD64VPSRLDMasked128const(v)
+       case OpAMD64VPSRLDMasked256const:
+               return rewriteValueAMD64_OpAMD64VPSRLDMasked256const(v)
+       case OpAMD64VPSRLDMasked512const:
+               return rewriteValueAMD64_OpAMD64VPSRLDMasked512const(v)
+       case OpAMD64VPSRLQ512const:
+               return rewriteValueAMD64_OpAMD64VPSRLQ512const(v)
+       case OpAMD64VPSRLQMasked128const:
+               return rewriteValueAMD64_OpAMD64VPSRLQMasked128const(v)
+       case OpAMD64VPSRLQMasked256const:
+               return rewriteValueAMD64_OpAMD64VPSRLQMasked256const(v)
+       case OpAMD64VPSRLQMasked512const:
+               return rewriteValueAMD64_OpAMD64VPSRLQMasked512const(v)
        case OpAMD64VPSRLVD512:
                return rewriteValueAMD64_OpAMD64VPSRLVD512(v)
        case OpAMD64VPSRLVDMasked128:
@@ -1395,6 +1623,54 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpAMD64VRCP14PSMasked256(v)
        case OpAMD64VRCP14PSMasked512:
                return rewriteValueAMD64_OpAMD64VRCP14PSMasked512(v)
+       case OpAMD64VREDUCEPD128:
+               return rewriteValueAMD64_OpAMD64VREDUCEPD128(v)
+       case OpAMD64VREDUCEPD256:
+               return rewriteValueAMD64_OpAMD64VREDUCEPD256(v)
+       case OpAMD64VREDUCEPD512:
+               return rewriteValueAMD64_OpAMD64VREDUCEPD512(v)
+       case OpAMD64VREDUCEPDMasked128:
+               return rewriteValueAMD64_OpAMD64VREDUCEPDMasked128(v)
+       case OpAMD64VREDUCEPDMasked256:
+               return rewriteValueAMD64_OpAMD64VREDUCEPDMasked256(v)
+       case OpAMD64VREDUCEPDMasked512:
+               return rewriteValueAMD64_OpAMD64VREDUCEPDMasked512(v)
+       case OpAMD64VREDUCEPS128:
+               return rewriteValueAMD64_OpAMD64VREDUCEPS128(v)
+       case OpAMD64VREDUCEPS256:
+               return rewriteValueAMD64_OpAMD64VREDUCEPS256(v)
+       case OpAMD64VREDUCEPS512:
+               return rewriteValueAMD64_OpAMD64VREDUCEPS512(v)
+       case OpAMD64VREDUCEPSMasked128:
+               return rewriteValueAMD64_OpAMD64VREDUCEPSMasked128(v)
+       case OpAMD64VREDUCEPSMasked256:
+               return rewriteValueAMD64_OpAMD64VREDUCEPSMasked256(v)
+       case OpAMD64VREDUCEPSMasked512:
+               return rewriteValueAMD64_OpAMD64VREDUCEPSMasked512(v)
+       case OpAMD64VRNDSCALEPD128:
+               return rewriteValueAMD64_OpAMD64VRNDSCALEPD128(v)
+       case OpAMD64VRNDSCALEPD256:
+               return rewriteValueAMD64_OpAMD64VRNDSCALEPD256(v)
+       case OpAMD64VRNDSCALEPD512:
+               return rewriteValueAMD64_OpAMD64VRNDSCALEPD512(v)
+       case OpAMD64VRNDSCALEPDMasked128:
+               return rewriteValueAMD64_OpAMD64VRNDSCALEPDMasked128(v)
+       case OpAMD64VRNDSCALEPDMasked256:
+               return rewriteValueAMD64_OpAMD64VRNDSCALEPDMasked256(v)
+       case OpAMD64VRNDSCALEPDMasked512:
+               return rewriteValueAMD64_OpAMD64VRNDSCALEPDMasked512(v)
+       case OpAMD64VRNDSCALEPS128:
+               return rewriteValueAMD64_OpAMD64VRNDSCALEPS128(v)
+       case OpAMD64VRNDSCALEPS256:
+               return rewriteValueAMD64_OpAMD64VRNDSCALEPS256(v)
+       case OpAMD64VRNDSCALEPS512:
+               return rewriteValueAMD64_OpAMD64VRNDSCALEPS512(v)
+       case OpAMD64VRNDSCALEPSMasked128:
+               return rewriteValueAMD64_OpAMD64VRNDSCALEPSMasked128(v)
+       case OpAMD64VRNDSCALEPSMasked256:
+               return rewriteValueAMD64_OpAMD64VRNDSCALEPSMasked256(v)
+       case OpAMD64VRNDSCALEPSMasked512:
+               return rewriteValueAMD64_OpAMD64VRNDSCALEPSMasked512(v)
        case OpAMD64VRSQRT14PD128:
                return rewriteValueAMD64_OpAMD64VRSQRT14PD128(v)
        case OpAMD64VRSQRT14PD256:
@@ -27680,6 +27956,266 @@ func rewriteValueAMD64_OpAMD64VADDPSMasked512(v *Value) bool {
        }
        return false
 }
+func rewriteValueAMD64_OpAMD64VCMPPD512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VCMPPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VCMPPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload512 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VCMPPD512load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg3(x, ptr, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VCMPPDMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VCMPPDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VCMPPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload128 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VCMPPDMasked128load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VCMPPDMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VCMPPDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VCMPPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload256 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VCMPPDMasked256load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VCMPPDMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VCMPPDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VCMPPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload512 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VCMPPDMasked512load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VCMPPS512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VCMPPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VCMPPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload512 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VCMPPS512load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg3(x, ptr, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VCMPPSMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VCMPPSMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VCMPPSMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload128 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VCMPPSMasked128load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VCMPPSMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VCMPPSMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VCMPPSMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload256 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VCMPPSMasked256load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VCMPPSMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VCMPPSMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VCMPPSMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload512 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VCMPPSMasked512load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
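
Each of the rules above folds two small constants into a single aux slot: the op's imm8 c and the load's 32-bit offset off, combined by makeValAndOff and stored with valAndOffToAuxInt. The following is a minimal, self-contained sketch of that packing only; the local valAndOff type and its methods merely mirror the assumed layout (value in the high 32 bits, offset in the low 32 bits) and are not the ssa package's actual definitions.

package main

import "fmt"

// valAndOff packs a 32-bit value (here the sign-extended imm8) and a
// 32-bit load offset into one int64 aux slot: value high, offset low.
// Illustration only; not the compiler's real ValAndOff type.
type valAndOff int64

func makeValAndOff(val, off int32) valAndOff {
	return valAndOff(int64(val)<<32 | int64(uint32(off)))
}

func (x valAndOff) val() int32 { return int32(x >> 32) }
func (x valAndOff) off() int32 { return int32(x) }

func main() {
	c := uint8(0x1f)  // imm8 taken from the original op's AuxInt
	off := int32(64)  // offset carried by the VMOVDQUload
	vo := makeValAndOff(int32(int8(c)), off)
	fmt.Println(vo.val(), vo.off()) // 31 64
}
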
 func rewriteValueAMD64_OpAMD64VCVTPS2UDQ128(v *Value) bool {
        v_0 := v.Args[0]
        // match: (VCVTPS2UDQ128 l:(VMOVDQUload128 {sym} [off] ptr mem))
@@ -29250,6 +29786,354 @@ func rewriteValueAMD64_OpAMD64VFMSUBADD213PSMasked512(v *Value) bool {
        }
        return false
 }
+func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQB128(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VGF2P8AFFINEINVQB128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VGF2P8AFFINEINVQB128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VGF2P8AFFINEINVQB128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQB256(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VGF2P8AFFINEINVQB256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VGF2P8AFFINEINVQB256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VGF2P8AFFINEINVQB256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQB512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VGF2P8AFFINEINVQB512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VGF2P8AFFINEINVQB512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VGF2P8AFFINEINVQB512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQBMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VGF2P8AFFINEINVQBMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VGF2P8AFFINEINVQBMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VGF2P8AFFINEINVQBMasked128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQBMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VGF2P8AFFINEINVQBMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VGF2P8AFFINEINVQBMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VGF2P8AFFINEINVQBMasked256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VGF2P8AFFINEINVQBMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VGF2P8AFFINEINVQBMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VGF2P8AFFINEINVQBMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VGF2P8AFFINEINVQBMasked512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQB128(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VGF2P8AFFINEQB128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VGF2P8AFFINEQB128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VGF2P8AFFINEQB128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQB256(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VGF2P8AFFINEQB256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VGF2P8AFFINEQB256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VGF2P8AFFINEQB256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQB512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VGF2P8AFFINEQB512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VGF2P8AFFINEQB512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VGF2P8AFFINEQB512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQBMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VGF2P8AFFINEQBMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VGF2P8AFFINEQBMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VGF2P8AFFINEQBMasked128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQBMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VGF2P8AFFINEQBMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VGF2P8AFFINEQBMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VGF2P8AFFINEQBMasked256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VGF2P8AFFINEQBMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VGF2P8AFFINEQBMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VGF2P8AFFINEQBMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VGF2P8AFFINEQBMasked512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
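
Every rule is guarded by canMergeLoad(v, l) && clobber(l): the first check must confirm the load can legally be folded into v (for instance, that it has no other uses), while clobber marks l dead once the fold is committed and always returns true so it can ride along inside the boolean condition. A toy sketch of that always-true-for-side-effect idiom follows, using stand-in types and placeholder checks rather than the ssa package's own helpers.

package main

import "fmt"

// value is a toy stand-in for an SSA value with a use count.
type value struct {
	op   string
	uses int
}

// canMergeLoad is a placeholder check: here it only requires that the
// load has a single use (the op we are folding it into).
func canMergeLoad(v, l *value) bool { return l.uses == 1 }

// clobber marks the load dead and always returns true, so it can be
// chained into a rewrite condition purely for its side effect.
func clobber(l *value) bool {
	l.uses = 0
	l.op = "Invalid"
	return true
}

func main() {
	v := &value{op: "VCMPPD512", uses: 1}
	l := &value{op: "VMOVDQUload512", uses: 1}
	if canMergeLoad(v, l) && clobber(l) {
		v.op = "VCMPPD512load" // fold the load into v
	}
	fmt.Println(v.op, l.op) // VCMPPD512load Invalid
}
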
 func rewriteValueAMD64_OpAMD64VMAXPD512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
@@ -34394,6 +35278,133 @@ func rewriteValueAMD64_OpAMD64VPBROADCASTW512(v *Value) bool {
        }
        return false
 }
+func rewriteValueAMD64_OpAMD64VPCMPD512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPCMPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPCMPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPCMPD512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPCMPDMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPCMPDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPCMPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload128 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPCMPDMasked128load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPCMPDMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPCMPDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPCMPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload256 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPCMPDMasked256load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPCMPDMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPCMPDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPCMPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload512 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPCMPDMasked512load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
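
The masked compare rules wrap the match in for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { ... }, which is how the generated matcher handles an op marked commutative: it tries the operands in both orders, continuing to the swapped order when the load is not on the expected side. A small standalone sketch of the same swap loop, independent of the ssa types:

package main

import "fmt"

// matchLoadOperand reports whether one of the two operands of a
// commutative op is a load, trying both operand orders the way the
// generated rewrite functions do.
func matchLoadOperand(v0, v1 string) (x, load string, ok bool) {
	for i := 0; i <= 1; i, v0, v1 = i+1, v1, v0 {
		if v1 != "load" { // the load must sit in the second slot
			continue
		}
		return v0, v1, true
	}
	return "", "", false
}

func main() {
	fmt.Println(matchLoadOperand("x", "load")) // x load true
	fmt.Println(matchLoadOperand("load", "x")) // x load true: matched after swapping
	fmt.Println(matchLoadOperand("x", "y"))    // no match: empty strings and false
}
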
 func rewriteValueAMD64_OpAMD64VPCMPEQD512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
@@ -34508,6 +35519,387 @@ func rewriteValueAMD64_OpAMD64VPCMPGTQ512(v *Value) bool {
        }
        return false
 }
+func rewriteValueAMD64_OpAMD64VPCMPQ512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPCMPQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPCMPQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPCMPQ512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPCMPQMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPCMPQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPCMPQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload128 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPCMPQMasked128load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPCMPQMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPCMPQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPCMPQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload256 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPCMPQMasked256load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPCMPQMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPCMPQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPCMPQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload512 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPCMPQMasked512load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPCMPUD512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPCMPUD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPCMPUD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPCMPUD512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPCMPUDMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPCMPUDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPCMPUDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload128 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPCMPUDMasked128load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPCMPUDMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPCMPUDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPCMPUDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload256 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPCMPUDMasked256load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPCMPUDMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPCMPUDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPCMPUDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload512 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPCMPUDMasked512load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPCMPUQ512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPCMPUQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPCMPUQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPCMPUQ512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPCMPUQMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPCMPUQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPCMPUQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload128 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPCMPUQMasked128load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPCMPUQMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPCMPUQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPCMPUQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload256 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPCMPUQMasked256load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPCMPUQMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPCMPUQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPCMPUQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload512 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPCMPUQMasked512load)
+                       v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
 func rewriteValueAMD64_OpAMD64VPDPBUSD512(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
@@ -38788,15 +40180,14 @@ func rewriteValueAMD64_OpAMD64VPORQMasked512(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPROLVD128(v *Value) bool {
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VPROLD128(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPROLVD128 x l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // match: (VPROLD128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPROLVD128load {sym} [off] x ptr mem)
+       // result: (VPROLD128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -38807,23 +40198,22 @@ func rewriteValueAMD64_OpAMD64VPROLVD128(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPROLVD128load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPROLD128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPROLVD256(v *Value) bool {
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VPROLD256(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPROLVD256 x l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // match: (VPROLD256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPROLVD256load {sym} [off] x ptr mem)
+       // result: (VPROLD256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -38834,23 +40224,22 @@ func rewriteValueAMD64_OpAMD64VPROLVD256(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPROLVD256load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPROLD256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPROLVD512(v *Value) bool {
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VPROLD512(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPROLVD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VPROLD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPROLVD512load {sym} [off] x ptr mem)
+       // result: (VPROLD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -38861,24 +40250,23 @@ func rewriteValueAMD64_OpAMD64VPROLVD512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPROLVD512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPROLD512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPROLVDMasked128(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPROLDMasked128(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPROLVDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VPROLDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPROLVDMasked128load {sym} [off] x ptr mask mem)
+       // result: (VPROLDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -38886,28 +40274,27 @@ func rewriteValueAMD64_OpAMD64VPROLVDMasked128(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPROLVDMasked128load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPROLDMasked128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPROLVDMasked256(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPROLDMasked256(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPROLVDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VPROLDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPROLVDMasked256load {sym} [off] x ptr mask mem)
+       // result: (VPROLDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -38915,28 +40302,27 @@ func rewriteValueAMD64_OpAMD64VPROLVDMasked256(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPROLVDMasked256load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPROLDMasked256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPROLVDMasked512(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPROLDMasked512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPROLVDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VPROLDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPROLVDMasked512load {sym} [off] x ptr mask mem)
+       // result: (VPROLDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -38944,27 +40330,26 @@ func rewriteValueAMD64_OpAMD64VPROLVDMasked512(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPROLVDMasked512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPROLDMasked512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPROLVQ128(v *Value) bool {
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VPROLQ128(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPROLVQ128 x l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // match: (VPROLQ128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPROLVQ128load {sym} [off] x ptr mem)
+       // result: (VPROLQ128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -38975,23 +40360,22 @@ func rewriteValueAMD64_OpAMD64VPROLVQ128(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPROLVQ128load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPROLQ128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPROLVQ256(v *Value) bool {
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VPROLQ256(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPROLVQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // match: (VPROLQ256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPROLVQ256load {sym} [off] x ptr mem)
+       // result: (VPROLQ256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -39002,23 +40386,22 @@ func rewriteValueAMD64_OpAMD64VPROLVQ256(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPROLVQ256load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPROLQ256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPROLVQ512(v *Value) bool {
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VPROLQ512(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPROLVQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VPROLQ512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPROLVQ512load {sym} [off] x ptr mem)
+       // result: (VPROLQ512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -39029,24 +40412,23 @@ func rewriteValueAMD64_OpAMD64VPROLVQ512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPROLVQ512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPROLQ512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPROLVQMasked128(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPROLQMasked128(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPROLVQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VPROLQMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPROLVQMasked128load {sym} [off] x ptr mask mem)
+       // result: (VPROLQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -39054,28 +40436,27 @@ func rewriteValueAMD64_OpAMD64VPROLVQMasked128(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPROLVQMasked128load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPROLQMasked128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPROLVQMasked256(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPROLQMasked256(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPROLVQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VPROLQMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPROLVQMasked256load {sym} [off] x ptr mask mem)
+       // result: (VPROLQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -39083,28 +40464,27 @@ func rewriteValueAMD64_OpAMD64VPROLVQMasked256(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPROLVQMasked256load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPROLQMasked256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPROLVQMasked512(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPROLQMasked512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPROLVQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VPROLQMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPROLVQMasked512load {sym} [off] x ptr mask mem)
+       // result: (VPROLQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -39112,24 +40492,24 @@ func rewriteValueAMD64_OpAMD64VPROLVQMasked512(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPROLVQMasked512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPROLQMasked512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPRORVD128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPROLVD128(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPRORVD128 x l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // match: (VPROLVD128 x l:(VMOVDQUload128 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPRORVD128load {sym} [off] x ptr mem)
+       // result: (VPROLVD128load {sym} [off] x ptr mem)
        for {
                x := v_0
                l := v_1
@@ -39143,7 +40523,7 @@ func rewriteValueAMD64_OpAMD64VPRORVD128(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPRORVD128load)
+               v.reset(OpAMD64VPROLVD128load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg3(x, ptr, mem)
@@ -39151,12 +40531,12 @@ func rewriteValueAMD64_OpAMD64VPRORVD128(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPRORVD256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPROLVD256(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPRORVD256 x l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // match: (VPROLVD256 x l:(VMOVDQUload256 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPRORVD256load {sym} [off] x ptr mem)
+       // result: (VPROLVD256load {sym} [off] x ptr mem)
        for {
                x := v_0
                l := v_1
@@ -39170,7 +40550,7 @@ func rewriteValueAMD64_OpAMD64VPRORVD256(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPRORVD256load)
+               v.reset(OpAMD64VPROLVD256load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg3(x, ptr, mem)
@@ -39178,12 +40558,12 @@ func rewriteValueAMD64_OpAMD64VPRORVD256(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPRORVD512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPROLVD512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPRORVD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VPROLVD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPRORVD512load {sym} [off] x ptr mem)
+       // result: (VPROLVD512load {sym} [off] x ptr mem)
        for {
                x := v_0
                l := v_1
@@ -39197,7 +40577,7 @@ func rewriteValueAMD64_OpAMD64VPRORVD512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPRORVD512load)
+               v.reset(OpAMD64VPROLVD512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg3(x, ptr, mem)
@@ -39205,13 +40585,13 @@ func rewriteValueAMD64_OpAMD64VPRORVD512(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPRORVDMasked128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPROLVDMasked128(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPRORVDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VPROLVDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPRORVDMasked128load {sym} [off] x ptr mask mem)
+       // result: (VPROLVDMasked128load {sym} [off] x ptr mask mem)
        for {
                x := v_0
                l := v_1
@@ -39226,7 +40606,7 @@ func rewriteValueAMD64_OpAMD64VPRORVDMasked128(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPRORVDMasked128load)
+               v.reset(OpAMD64VPROLVDMasked128load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg4(x, ptr, mask, mem)
@@ -39234,13 +40614,13 @@ func rewriteValueAMD64_OpAMD64VPRORVDMasked128(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPRORVDMasked256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPROLVDMasked256(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPRORVDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VPROLVDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPRORVDMasked256load {sym} [off] x ptr mask mem)
+       // result: (VPROLVDMasked256load {sym} [off] x ptr mask mem)
        for {
                x := v_0
                l := v_1
@@ -39255,7 +40635,7 @@ func rewriteValueAMD64_OpAMD64VPRORVDMasked256(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPRORVDMasked256load)
+               v.reset(OpAMD64VPROLVDMasked256load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg4(x, ptr, mask, mem)
@@ -39263,13 +40643,13 @@ func rewriteValueAMD64_OpAMD64VPRORVDMasked256(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPRORVDMasked512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPROLVDMasked512(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPRORVDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VPROLVDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPRORVDMasked512load {sym} [off] x ptr mask mem)
+       // result: (VPROLVDMasked512load {sym} [off] x ptr mask mem)
        for {
                x := v_0
                l := v_1
@@ -39284,7 +40664,7 @@ func rewriteValueAMD64_OpAMD64VPRORVDMasked512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPRORVDMasked512load)
+               v.reset(OpAMD64VPROLVDMasked512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg4(x, ptr, mask, mem)
@@ -39292,12 +40672,12 @@ func rewriteValueAMD64_OpAMD64VPRORVDMasked512(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPRORVQ128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPROLVQ128(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPRORVQ128 x l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // match: (VPROLVQ128 x l:(VMOVDQUload128 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPRORVQ128load {sym} [off] x ptr mem)
+       // result: (VPROLVQ128load {sym} [off] x ptr mem)
        for {
                x := v_0
                l := v_1
@@ -39311,7 +40691,7 @@ func rewriteValueAMD64_OpAMD64VPRORVQ128(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPRORVQ128load)
+               v.reset(OpAMD64VPROLVQ128load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg3(x, ptr, mem)
@@ -39319,12 +40699,12 @@ func rewriteValueAMD64_OpAMD64VPRORVQ128(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPRORVQ256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPROLVQ256(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPRORVQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // match: (VPROLVQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPRORVQ256load {sym} [off] x ptr mem)
+       // result: (VPROLVQ256load {sym} [off] x ptr mem)
        for {
                x := v_0
                l := v_1
@@ -39338,7 +40718,7 @@ func rewriteValueAMD64_OpAMD64VPRORVQ256(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPRORVQ256load)
+               v.reset(OpAMD64VPROLVQ256load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg3(x, ptr, mem)
@@ -39346,12 +40726,12 @@ func rewriteValueAMD64_OpAMD64VPRORVQ256(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPRORVQ512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPROLVQ512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPRORVQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VPROLVQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPRORVQ512load {sym} [off] x ptr mem)
+       // result: (VPROLVQ512load {sym} [off] x ptr mem)
        for {
                x := v_0
                l := v_1
@@ -39365,7 +40745,7 @@ func rewriteValueAMD64_OpAMD64VPRORVQ512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPRORVQ512load)
+               v.reset(OpAMD64VPROLVQ512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg3(x, ptr, mem)
@@ -39373,13 +40753,13 @@ func rewriteValueAMD64_OpAMD64VPRORVQ512(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPRORVQMasked128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPROLVQMasked128(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPRORVQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VPROLVQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPRORVQMasked128load {sym} [off] x ptr mask mem)
+       // result: (VPROLVQMasked128load {sym} [off] x ptr mask mem)
        for {
                x := v_0
                l := v_1
@@ -39394,7 +40774,7 @@ func rewriteValueAMD64_OpAMD64VPRORVQMasked128(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPRORVQMasked128load)
+               v.reset(OpAMD64VPROLVQMasked128load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg4(x, ptr, mask, mem)
@@ -39402,13 +40782,13 @@ func rewriteValueAMD64_OpAMD64VPRORVQMasked128(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPRORVQMasked256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPROLVQMasked256(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPRORVQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VPROLVQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPRORVQMasked256load {sym} [off] x ptr mask mem)
+       // result: (VPROLVQMasked256load {sym} [off] x ptr mask mem)
        for {
                x := v_0
                l := v_1
@@ -39423,7 +40803,7 @@ func rewriteValueAMD64_OpAMD64VPRORVQMasked256(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPRORVQMasked256load)
+               v.reset(OpAMD64VPROLVQMasked256load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg4(x, ptr, mask, mem)
@@ -39431,13 +40811,13 @@ func rewriteValueAMD64_OpAMD64VPRORVQMasked256(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPRORVQMasked512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPROLVQMasked512(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPRORVQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VPROLVQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPRORVQMasked512load {sym} [off] x ptr mask mem)
+       // result: (VPROLVQMasked512load {sym} [off] x ptr mask mem)
        for {
                x := v_0
                l := v_1
@@ -39452,7 +40832,7 @@ func rewriteValueAMD64_OpAMD64VPRORVQMasked512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPRORVQMasked512load)
+               v.reset(OpAMD64VPROLVQMasked512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg4(x, ptr, mask, mem)
@@ -39460,17 +40840,14 @@ func rewriteValueAMD64_OpAMD64VPRORVQMasked512(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHLDVD128(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VPRORD128(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPSHLDVD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // match: (VPRORD128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHLDVD128load {sym} [off] x y ptr mem)
+       // result: (VPRORD128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               y := v_1
-               l := v_2
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -39481,25 +40858,22 @@ func rewriteValueAMD64_OpAMD64VPSHLDVD128(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHLDVD128load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPRORD128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, y, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHLDVD256(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VPRORD256(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPSHLDVD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // match: (VPRORD256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHLDVD256load {sym} [off] x y ptr mem)
+       // result: (VPRORD256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               y := v_1
-               l := v_2
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -39510,25 +40884,22 @@ func rewriteValueAMD64_OpAMD64VPSHLDVD256(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHLDVD256load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPRORD256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, y, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHLDVD512(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VPRORD512(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPSHLDVD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VPRORD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHLDVD512load {sym} [off] x y ptr mem)
+       // result: (VPRORD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               y := v_1
-               l := v_2
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -39539,26 +40910,23 @@ func rewriteValueAMD64_OpAMD64VPSHLDVD512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHLDVD512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPRORD512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, y, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHLDVDMasked128(v *Value) bool {
-       v_3 := v.Args[3]
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPRORDMasked128(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHLDVDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VPRORDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHLDVDMasked128load {sym} [off] x y ptr mask mem)
+       // result: (VPRORDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               x := v_0
-               y := v_1
-               l := v_2
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -39566,30 +40934,27 @@ func rewriteValueAMD64_OpAMD64VPSHLDVDMasked128(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_3
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHLDVDMasked128load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPRORDMasked128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg5(x, y, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHLDVDMasked256(v *Value) bool {
-       v_3 := v.Args[3]
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPRORDMasked256(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHLDVDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VPRORDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHLDVDMasked256load {sym} [off] x y ptr mask mem)
+       // result: (VPRORDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               x := v_0
-               y := v_1
-               l := v_2
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -39597,30 +40962,27 @@ func rewriteValueAMD64_OpAMD64VPSHLDVDMasked256(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_3
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHLDVDMasked256load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPRORDMasked256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg5(x, y, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHLDVDMasked512(v *Value) bool {
-       v_3 := v.Args[3]
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPRORDMasked512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHLDVDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VPRORDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHLDVDMasked512load {sym} [off] x y ptr mask mem)
+       // result: (VPRORDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               x := v_0
-               y := v_1
-               l := v_2
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -39628,29 +40990,26 @@ func rewriteValueAMD64_OpAMD64VPSHLDVDMasked512(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_3
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHLDVDMasked512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPRORDMasked512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg5(x, y, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
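The masked variants keep the same shape, except that the loaded vector becomes the first operand of the memory-form op and the mask moves down one argument slot (mask := v_1 rather than v_2). A toy sketch of that match-and-replace step, using a hypothetical node type and op names rather than the real *ssa.Value API:

package main

import "fmt"

// node is a toy stand-in for an SSA value: an op name, an aux payload and args.
type node struct {
	op   string
	aux  int64
	args []*node
}

// rewriteRotateConstLoad mirrors the generated pattern above for a masked
// rotate-by-immediate: (ROT [c] l:(load [off] ptr mem) mask)
// => (ROTload [pack(c,off)] ptr mask mem), provided the load may be merged.
func rewriteRotateConstLoad(v *node, canMerge func(*node) bool) bool {
	l := v.args[0]
	if l.op != "load" {
		return false
	}
	off := int32(l.aux)
	mask := v.args[1]
	if !canMerge(l) {
		return false
	}
	c := uint8(v.aux)
	ptr, mem := l.args[0], l.args[1]
	v.op = "ROTload"
	v.aux = int64(int32(int8(c)))<<32 | int64(uint32(off)) // pack imm8 with offset
	v.args = []*node{ptr, mask, mem}
	return true
}

func main() {
	ptr := &node{op: "addr"}
	mem := &node{op: "mem"}
	mask := &node{op: "mask"}
	load := &node{op: "load", aux: 64, args: []*node{ptr, mem}}
	rot := &node{op: "ROT", aux: 17, args: []*node{load, mask}}
	ok := rewriteRotateConstLoad(rot, func(*node) bool { return true })
	fmt.Println(ok, rot.op, len(rot.args)) // true ROTload 3
}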
-func rewriteValueAMD64_OpAMD64VPSHLDVQ128(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VPRORQ128(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPSHLDVQ128 x y l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // match: (VPRORQ128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHLDVQ128load {sym} [off] x y ptr mem)
+       // result: (VPRORQ128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               y := v_1
-               l := v_2
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -39661,25 +41020,22 @@ func rewriteValueAMD64_OpAMD64VPSHLDVQ128(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHLDVQ128load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPRORQ128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, y, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHLDVQ256(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VPRORQ256(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPSHLDVQ256 x y l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // match: (VPRORQ256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHLDVQ256load {sym} [off] x y ptr mem)
+       // result: (VPRORQ256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               y := v_1
-               l := v_2
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -39690,25 +41046,22 @@ func rewriteValueAMD64_OpAMD64VPSHLDVQ256(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHLDVQ256load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPRORQ256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, y, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHLDVQ512(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VPRORQ512(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPSHLDVQ512 x y l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VPRORQ512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHLDVQ512load {sym} [off] x y ptr mem)
+       // result: (VPRORQ512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               y := v_1
-               l := v_2
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -39719,26 +41072,23 @@ func rewriteValueAMD64_OpAMD64VPSHLDVQ512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHLDVQ512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPRORQ512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, y, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHLDVQMasked128(v *Value) bool {
-       v_3 := v.Args[3]
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPRORQMasked128(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHLDVQMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VPRORQMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHLDVQMasked128load {sym} [off] x y ptr mask mem)
+       // result: (VPRORQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               x := v_0
-               y := v_1
-               l := v_2
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -39746,30 +41096,27 @@ func rewriteValueAMD64_OpAMD64VPSHLDVQMasked128(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_3
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHLDVQMasked128load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPRORQMasked128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg5(x, y, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHLDVQMasked256(v *Value) bool {
-       v_3 := v.Args[3]
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPRORQMasked256(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHLDVQMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VPRORQMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHLDVQMasked256load {sym} [off] x y ptr mask mem)
+       // result: (VPRORQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               x := v_0
-               y := v_1
-               l := v_2
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -39777,30 +41124,27 @@ func rewriteValueAMD64_OpAMD64VPSHLDVQMasked256(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_3
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHLDVQMasked256load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPRORQMasked256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg5(x, y, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHLDVQMasked512(v *Value) bool {
-       v_3 := v.Args[3]
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPRORQMasked512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHLDVQMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VPRORQMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHLDVQMasked512load {sym} [off] x y ptr mask mem)
+       // result: (VPRORQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               x := v_0
-               y := v_1
-               l := v_2
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -39808,29 +41152,27 @@ func rewriteValueAMD64_OpAMD64VPSHLDVQMasked512(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_3
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHLDVQMasked512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPRORQMasked512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg5(x, y, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHRDVD128(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPRORVD128(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHRDVD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // match: (VPRORVD128 x l:(VMOVDQUload128 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHRDVD128load {sym} [off] x y ptr mem)
+       // result: (VPRORVD128load {sym} [off] x ptr mem)
        for {
                x := v_0
-               y := v_1
-               l := v_2
+               l := v_1
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -39841,25 +41183,23 @@ func rewriteValueAMD64_OpAMD64VPSHRDVD128(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHRDVD128load)
+               v.reset(OpAMD64VPRORVD128load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, y, ptr, mem)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHRDVD256(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPRORVD256(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHRDVD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // match: (VPRORVD256 x l:(VMOVDQUload256 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHRDVD256load {sym} [off] x y ptr mem)
+       // result: (VPRORVD256load {sym} [off] x ptr mem)
        for {
                x := v_0
-               y := v_1
-               l := v_2
+               l := v_1
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -39870,25 +41210,23 @@ func rewriteValueAMD64_OpAMD64VPSHRDVD256(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHRDVD256load)
+               v.reset(OpAMD64VPRORVD256load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, y, ptr, mem)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHRDVD512(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPRORVD512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHRDVD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VPRORVD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHRDVD512load {sym} [off] x y ptr mem)
+       // result: (VPRORVD512load {sym} [off] x ptr mem)
        for {
                x := v_0
-               y := v_1
-               l := v_2
+               l := v_1
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -39899,26 +41237,24 @@ func rewriteValueAMD64_OpAMD64VPSHRDVD512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHRDVD512load)
+               v.reset(OpAMD64VPRORVD512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, y, ptr, mem)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHRDVDMasked128(v *Value) bool {
-       v_3 := v.Args[3]
+func rewriteValueAMD64_OpAMD64VPRORVDMasked128(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHRDVDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VPRORVDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHRDVDMasked128load {sym} [off] x y ptr mask mem)
+       // result: (VPRORVDMasked128load {sym} [off] x ptr mask mem)
        for {
                x := v_0
-               y := v_1
-               l := v_2
+               l := v_1
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -39926,30 +41262,28 @@ func rewriteValueAMD64_OpAMD64VPSHRDVDMasked128(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_3
+               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHRDVDMasked128load)
+               v.reset(OpAMD64VPRORVDMasked128load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg5(x, y, ptr, mask, mem)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHRDVDMasked256(v *Value) bool {
-       v_3 := v.Args[3]
+func rewriteValueAMD64_OpAMD64VPRORVDMasked256(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHRDVDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VPRORVDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHRDVDMasked256load {sym} [off] x y ptr mask mem)
+       // result: (VPRORVDMasked256load {sym} [off] x ptr mask mem)
        for {
                x := v_0
-               y := v_1
-               l := v_2
+               l := v_1
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -39957,30 +41291,28 @@ func rewriteValueAMD64_OpAMD64VPSHRDVDMasked256(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_3
+               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHRDVDMasked256load)
+               v.reset(OpAMD64VPRORVDMasked256load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg5(x, y, ptr, mask, mem)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHRDVDMasked512(v *Value) bool {
-       v_3 := v.Args[3]
+func rewriteValueAMD64_OpAMD64VPRORVDMasked512(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHRDVDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VPRORVDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHRDVDMasked512load {sym} [off] x y ptr mask mem)
+       // result: (VPRORVDMasked512load {sym} [off] x ptr mask mem)
        for {
                x := v_0
-               y := v_1
-               l := v_2
+               l := v_1
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -39988,29 +41320,27 @@ func rewriteValueAMD64_OpAMD64VPSHRDVDMasked512(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_3
+               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHRDVDMasked512load)
+               v.reset(OpAMD64VPRORVDMasked512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg5(x, y, ptr, mask, mem)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHRDVQ128(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPRORVQ128(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHRDVQ128 x y l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // match: (VPRORVQ128 x l:(VMOVDQUload128 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHRDVQ128load {sym} [off] x y ptr mem)
+       // result: (VPRORVQ128load {sym} [off] x ptr mem)
        for {
                x := v_0
-               y := v_1
-               l := v_2
+               l := v_1
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -40021,25 +41351,23 @@ func rewriteValueAMD64_OpAMD64VPSHRDVQ128(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHRDVQ128load)
+               v.reset(OpAMD64VPRORVQ128load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, y, ptr, mem)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHRDVQ256(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPRORVQ256(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHRDVQ256 x y l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // match: (VPRORVQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHRDVQ256load {sym} [off] x y ptr mem)
+       // result: (VPRORVQ256load {sym} [off] x ptr mem)
        for {
                x := v_0
-               y := v_1
-               l := v_2
+               l := v_1
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -40050,25 +41378,23 @@ func rewriteValueAMD64_OpAMD64VPSHRDVQ256(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHRDVQ256load)
+               v.reset(OpAMD64VPRORVQ256load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, y, ptr, mem)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHRDVQ512(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPRORVQ512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHRDVQ512 x y l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VPRORVQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHRDVQ512load {sym} [off] x y ptr mem)
+       // result: (VPRORVQ512load {sym} [off] x ptr mem)
        for {
                x := v_0
-               y := v_1
-               l := v_2
+               l := v_1
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -40079,26 +41405,24 @@ func rewriteValueAMD64_OpAMD64VPSHRDVQ512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHRDVQ512load)
+               v.reset(OpAMD64VPRORVQ512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, y, ptr, mem)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHRDVQMasked128(v *Value) bool {
-       v_3 := v.Args[3]
+func rewriteValueAMD64_OpAMD64VPRORVQMasked128(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHRDVQMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VPRORVQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHRDVQMasked128load {sym} [off] x y ptr mask mem)
+       // result: (VPRORVQMasked128load {sym} [off] x ptr mask mem)
        for {
                x := v_0
-               y := v_1
-               l := v_2
+               l := v_1
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -40106,30 +41430,28 @@ func rewriteValueAMD64_OpAMD64VPSHRDVQMasked128(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_3
+               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHRDVQMasked128load)
+               v.reset(OpAMD64VPRORVQMasked128load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg5(x, y, ptr, mask, mem)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHRDVQMasked256(v *Value) bool {
-       v_3 := v.Args[3]
+func rewriteValueAMD64_OpAMD64VPRORVQMasked256(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHRDVQMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VPRORVQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHRDVQMasked256load {sym} [off] x y ptr mask mem)
+       // result: (VPRORVQMasked256load {sym} [off] x ptr mask mem)
        for {
                x := v_0
-               y := v_1
-               l := v_2
+               l := v_1
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -40137,30 +41459,28 @@ func rewriteValueAMD64_OpAMD64VPSHRDVQMasked256(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_3
+               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHRDVQMasked256load)
+               v.reset(OpAMD64VPRORVQMasked256load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg5(x, y, ptr, mask, mem)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSHRDVQMasked512(v *Value) bool {
-       v_3 := v.Args[3]
+func rewriteValueAMD64_OpAMD64VPRORVQMasked512(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSHRDVQMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VPRORVQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSHRDVQMasked512load {sym} [off] x y ptr mask mem)
+       // result: (VPRORVQMasked512load {sym} [off] x ptr mask mem)
        for {
                x := v_0
-               y := v_1
-               l := v_2
+               l := v_1
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -40168,253 +41488,345 @@ func rewriteValueAMD64_OpAMD64VPSHRDVQMasked512(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_3
+               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSHRDVQMasked512load)
+               v.reset(OpAMD64VPRORVQMasked512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg5(x, y, ptr, mask, mem)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLD128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDD128(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLD128 x (MOVQconst [c]))
-       // result: (VPSLLD128const [uint8(c)] x)
+       // match: (VPSHLDD128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHLDD128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSLLD128const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg(x)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHLDD128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLD256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDD256(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLD256 x (MOVQconst [c]))
-       // result: (VPSLLD256const [uint8(c)] x)
+       // match: (VPSHLDD256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHLDD256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSLLD256const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg(x)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHLDD256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLD512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDD512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLD512 x (MOVQconst [c]))
-       // result: (VPSLLD512const [uint8(c)] x)
+       // match: (VPSHLDD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHLDD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSLLD512const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg(x)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHLDD512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLDMasked128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDDMasked128(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLDMasked128 x (MOVQconst [c]) mask)
-       // result: (VPSLLDMasked128const [uint8(c)] x mask)
+       // match: (VPSHLDDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHLDDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
                mask := v_2
-               v.reset(OpAMD64VPSLLDMasked128const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg2(x, mask)
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHLDDMasked128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLDMasked256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDDMasked256(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLDMasked256 x (MOVQconst [c]) mask)
-       // result: (VPSLLDMasked256const [uint8(c)] x mask)
+       // match: (VPSHLDDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHLDDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
                mask := v_2
-               v.reset(OpAMD64VPSLLDMasked256const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg2(x, mask)
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHLDDMasked256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLDMasked512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDDMasked512(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLDMasked512 x (MOVQconst [c]) mask)
-       // result: (VPSLLDMasked512const [uint8(c)] x mask)
+       // match: (VPSHLDDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHLDDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
                mask := v_2
-               v.reset(OpAMD64VPSLLDMasked512const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg2(x, mask)
-               return true
-       }
-       return false
-}
-func rewriteValueAMD64_OpAMD64VPSLLQ128(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       // match: (VPSLLQ128 x (MOVQconst [c]))
-       // result: (VPSLLQ128const [uint8(c)] x)
-       for {
-               x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSLLQ128const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg(x)
+               v.reset(OpAMD64VPSHLDDMasked512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLQ256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDQ128(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLQ256 x (MOVQconst [c]))
-       // result: (VPSLLQ256const [uint8(c)] x)
+       // match: (VPSHLDQ128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHLDQ128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSLLQ256const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg(x)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHLDQ128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLQ512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDQ256(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLQ512 x (MOVQconst [c]))
-       // result: (VPSLLQ512const [uint8(c)] x)
+       // match: (VPSHLDQ256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHLDQ256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSLLQ512const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg(x)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHLDQ256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLQMasked128(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPSHLDQ512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLQMasked128 x (MOVQconst [c]) mask)
-       // result: (VPSLLQMasked128const [uint8(c)] x mask)
+       // match: (VPSHLDQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHLDQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               mask := v_2
-               v.reset(OpAMD64VPSLLQMasked128const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg2(x, mask)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHLDQ512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLQMasked256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDQMasked128(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLQMasked256 x (MOVQconst [c]) mask)
-       // result: (VPSLLQMasked256const [uint8(c)] x mask)
+       // match: (VPSHLDQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHLDQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
                mask := v_2
-               v.reset(OpAMD64VPSLLQMasked256const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg2(x, mask)
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHLDQMasked128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLQMasked512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDQMasked256(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLQMasked512 x (MOVQconst [c]) mask)
-       // result: (VPSLLQMasked512const [uint8(c)] x mask)
+       // match: (VPSHLDQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHLDQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
                mask := v_2
-               v.reset(OpAMD64VPSLLQMasked512const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg2(x, mask)
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHLDQMasked256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLVD512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDQMasked512(v *Value) bool {
+       v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLVD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VPSHLDQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSLLVD512load {sym} [off] x ptr mem)
+       // result: (VPSHLDQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
                l := v_1
                if l.Op != OpAMD64VMOVDQUload512 {
@@ -40424,27 +41836,29 @@ func rewriteValueAMD64_OpAMD64VPSLLVD512(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
+               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSLLVD512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPSHLDQMasked512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLVDMasked128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDVD128(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLVDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VPSHLDVD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSLLVDMasked128load {sym} [off] x ptr mask mem)
+       // result: (VPSHLDVD128load {sym} [off] x y ptr mem)
        for {
                x := v_0
-               l := v_1
+               y := v_1
+               l := v_2
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -40452,28 +41866,28 @@ func rewriteValueAMD64_OpAMD64VPSLLVDMasked128(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSLLVDMasked128load)
+               v.reset(OpAMD64VPSHLDVD128load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg4(x, y, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLVDMasked256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDVD256(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLVDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VPSHLDVD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSLLVDMasked256load {sym} [off] x ptr mask mem)
+       // result: (VPSHLDVD256load {sym} [off] x y ptr mem)
        for {
                x := v_0
-               l := v_1
+               y := v_1
+               l := v_2
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -40481,28 +41895,28 @@ func rewriteValueAMD64_OpAMD64VPSLLVDMasked256(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSLLVDMasked256load)
+               v.reset(OpAMD64VPSHLDVD256load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg4(x, y, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLVDMasked512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDVD512(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLVDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VPSHLDVD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSLLVDMasked512load {sym} [off] x ptr mask mem)
+       // result: (VPSHLDVD512load {sym} [off] x y ptr mem)
        for {
                x := v_0
-               l := v_1
+               y := v_1
+               l := v_2
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -40510,481 +41924,617 @@ func rewriteValueAMD64_OpAMD64VPSLLVDMasked512(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSLLVDMasked512load)
+               v.reset(OpAMD64VPSHLDVD512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg4(x, y, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLVQ512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDVDMasked128(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLVQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VPSHLDVDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSLLVQ512load {sym} [off] x ptr mem)
+       // result: (VPSHLDVDMasked128load {sym} [off] x y ptr mask mem)
        for {
                x := v_0
-               l := v_1
-               if l.Op != OpAMD64VMOVDQUload512 {
+               y := v_1
+               l := v_2
+               if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
                off := auxIntToInt32(l.AuxInt)
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
+               mask := v_3
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSLLVQ512load)
+               v.reset(OpAMD64VPSHLDVDMasked128load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg5(x, y, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLVQMasked128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDVDMasked256(v *Value) bool {
+       v_3 := v.Args[3]
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLVQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VPSHLDVDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSLLVQMasked128load {sym} [off] x ptr mask mem)
+       // result: (VPSHLDVDMasked256load {sym} [off] x y ptr mask mem)
        for {
                x := v_0
-               l := v_1
-               if l.Op != OpAMD64VMOVDQUload128 {
+               y := v_1
+               l := v_2
+               if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
                off := auxIntToInt32(l.AuxInt)
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
+               mask := v_3
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSLLVQMasked128load)
+               v.reset(OpAMD64VPSHLDVDMasked256load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg5(x, y, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLVQMasked256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDVDMasked512(v *Value) bool {
+       v_3 := v.Args[3]
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLVQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VPSHLDVDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSLLVQMasked256load {sym} [off] x ptr mask mem)
+       // result: (VPSHLDVDMasked512load {sym} [off] x y ptr mask mem)
        for {
                x := v_0
-               l := v_1
-               if l.Op != OpAMD64VMOVDQUload256 {
+               y := v_1
+               l := v_2
+               if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
                off := auxIntToInt32(l.AuxInt)
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
+               mask := v_3
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSLLVQMasked256load)
+               v.reset(OpAMD64VPSHLDVDMasked512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg5(x, y, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLVQMasked512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDVQ128(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLVQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VPSHLDVQ128 x y l:(VMOVDQUload128 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSLLVQMasked512load {sym} [off] x ptr mask mem)
+       // result: (VPSHLDVQ128load {sym} [off] x y ptr mem)
        for {
                x := v_0
-               l := v_1
-               if l.Op != OpAMD64VMOVDQUload512 {
+               y := v_1
+               l := v_2
+               if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
                off := auxIntToInt32(l.AuxInt)
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSLLVQMasked512load)
+               v.reset(OpAMD64VPSHLDVQ128load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg4(x, y, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLW128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDVQ256(v *Value) bool {
+       v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLW128 x (MOVQconst [c]))
-       // result: (VPSLLW128const [uint8(c)] x)
+       // match: (VPSHLDVQ256 x y l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHLDVQ256load {sym} [off] x y ptr mem)
        for {
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               y := v_1
+               l := v_2
+               if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSLLW128const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg(x)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHLDVQ256load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, y, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLW256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDVQ512(v *Value) bool {
+       v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLW256 x (MOVQconst [c]))
-       // result: (VPSLLW256const [uint8(c)] x)
+       // match: (VPSHLDVQ512 x y l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHLDVQ512load {sym} [off] x y ptr mem)
        for {
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               y := v_1
+               l := v_2
+               if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSLLW256const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg(x)
-               return true
-       }
-       return false
-}
-func rewriteValueAMD64_OpAMD64VPSLLW512(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       // match: (VPSLLW512 x (MOVQconst [c]))
-       // result: (VPSLLW512const [uint8(c)] x)
-       for {
-               x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSLLW512const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg(x)
+               v.reset(OpAMD64VPSHLDVQ512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, y, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLWMasked128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDVQMasked128(v *Value) bool {
+       v_3 := v.Args[3]
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLWMasked128 x (MOVQconst [c]) mask)
-       // result: (VPSLLWMasked128const [uint8(c)] x mask)
+       // match: (VPSHLDVQMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHLDVQMasked128load {sym} [off] x y ptr mask mem)
        for {
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               y := v_1
+               l := v_2
+               if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               mask := v_2
-               v.reset(OpAMD64VPSLLWMasked128const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg2(x, mask)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_3
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHLDVQMasked128load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg5(x, y, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLWMasked256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDVQMasked256(v *Value) bool {
+       v_3 := v.Args[3]
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLWMasked256 x (MOVQconst [c]) mask)
-       // result: (VPSLLWMasked256const [uint8(c)] x mask)
+       // match: (VPSHLDVQMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHLDVQMasked256load {sym} [off] x y ptr mask mem)
        for {
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               y := v_1
+               l := v_2
+               if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               mask := v_2
-               v.reset(OpAMD64VPSLLWMasked256const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg2(x, mask)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_3
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHLDVQMasked256load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg5(x, y, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSLLWMasked512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHLDVQMasked512(v *Value) bool {
+       v_3 := v.Args[3]
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSLLWMasked512 x (MOVQconst [c]) mask)
-       // result: (VPSLLWMasked512const [uint8(c)] x mask)
+       // match: (VPSHLDVQMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHLDVQMasked512load {sym} [off] x y ptr mask mem)
        for {
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               y := v_1
+               l := v_2
+               if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               mask := v_2
-               v.reset(OpAMD64VPSLLWMasked512const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg2(x, mask)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_3
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHLDVQMasked512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg5(x, y, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAD128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDD128(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAD128 x (MOVQconst [c]))
-       // result: (VPSRAD128const [uint8(c)] x)
+       // match: (VPSHRDD128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHRDD128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSRAD128const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg(x)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHRDD128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAD256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDD256(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAD256 x (MOVQconst [c]))
-       // result: (VPSRAD256const [uint8(c)] x)
+       // match: (VPSHRDD256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHRDD256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSRAD256const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg(x)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHRDD256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAD512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDD512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAD512 x (MOVQconst [c]))
-       // result: (VPSRAD512const [uint8(c)] x)
+       // match: (VPSHRDD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHRDD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSRAD512const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg(x)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHRDD512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRADMasked128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDDMasked128(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRADMasked128 x (MOVQconst [c]) mask)
-       // result: (VPSRADMasked128const [uint8(c)] x mask)
+       // match: (VPSHRDDMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHRDDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
                mask := v_2
-               v.reset(OpAMD64VPSRADMasked128const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg2(x, mask)
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHRDDMasked128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRADMasked256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDDMasked256(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRADMasked256 x (MOVQconst [c]) mask)
-       // result: (VPSRADMasked256const [uint8(c)] x mask)
+       // match: (VPSHRDDMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHRDDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
                mask := v_2
-               v.reset(OpAMD64VPSRADMasked256const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg2(x, mask)
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHRDDMasked256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRADMasked512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDDMasked512(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRADMasked512 x (MOVQconst [c]) mask)
-       // result: (VPSRADMasked512const [uint8(c)] x mask)
+       // match: (VPSHRDDMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHRDDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
                mask := v_2
-               v.reset(OpAMD64VPSRADMasked512const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg2(x, mask)
-               return true
-       }
-       return false
-}
-func rewriteValueAMD64_OpAMD64VPSRAQ128(v *Value) bool {
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       // match: (VPSRAQ128 x (MOVQconst [c]))
-       // result: (VPSRAQ128const [uint8(c)] x)
-       for {
-               x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSRAQ128const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg(x)
+               v.reset(OpAMD64VPSHRDDMasked512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAQ256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDQ128(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAQ256 x (MOVQconst [c]))
-       // result: (VPSRAQ256const [uint8(c)] x)
+       // match: (VPSHRDQ128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHRDQ128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSRAQ256const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg(x)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHRDQ128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAQ512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDQ256(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAQ512 x (MOVQconst [c]))
-       // result: (VPSRAQ512const [uint8(c)] x)
+       // match: (VPSHRDQ256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHRDQ256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSRAQ512const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg(x)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHRDQ256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAQMasked128(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VPSHRDQ512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAQMasked128 x (MOVQconst [c]) mask)
-       // result: (VPSRAQMasked128const [uint8(c)] x mask)
+       // match: (VPSHRDQ512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHRDQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
-               mask := v_2
-               v.reset(OpAMD64VPSRAQMasked128const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg2(x, mask)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHRDQ512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAQMasked256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDQMasked128(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAQMasked256 x (MOVQconst [c]) mask)
-       // result: (VPSRAQMasked256const [uint8(c)] x mask)
+       // match: (VPSHRDQMasked128 [c] x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHRDQMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
                mask := v_2
-               v.reset(OpAMD64VPSRAQMasked256const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg2(x, mask)
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHRDQMasked128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAQMasked512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDQMasked256(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAQMasked512 x (MOVQconst [c]) mask)
-       // result: (VPSRAQMasked512const [uint8(c)] x mask)
+       // match: (VPSHRDQMasked256 [c] x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHRDQMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
-               c := auxIntToInt64(v_1.AuxInt)
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
                mask := v_2
-               v.reset(OpAMD64VPSRAQMasked512const)
-               v.AuxInt = uint8ToAuxInt(uint8(c))
-               v.AddArg2(x, mask)
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHRDQMasked256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAVD512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDQMasked512(v *Value) bool {
+       v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAVD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VPSHRDQMasked512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRAVD512load {sym} [off] x ptr mem)
+       // result: (VPSHRDQMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                x := v_0
                l := v_1
                if l.Op != OpAMD64VMOVDQUload512 {
@@ -40994,27 +42544,29 @@ func rewriteValueAMD64_OpAMD64VPSRAVD512(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
+               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRAVD512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPSHRDQMasked512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg4(x, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAVDMasked128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDVD128(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAVDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VPSHRDVD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRAVDMasked128load {sym} [off] x ptr mask mem)
+       // result: (VPSHRDVD128load {sym} [off] x y ptr mem)
        for {
                x := v_0
-               l := v_1
+               y := v_1
+               l := v_2
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -41022,28 +42574,28 @@ func rewriteValueAMD64_OpAMD64VPSRAVDMasked128(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRAVDMasked128load)
+               v.reset(OpAMD64VPSHRDVD128load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg4(x, y, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAVDMasked256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDVD256(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAVDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VPSHRDVD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRAVDMasked256load {sym} [off] x ptr mask mem)
+       // result: (VPSHRDVD256load {sym} [off] x y ptr mem)
        for {
                x := v_0
-               l := v_1
+               y := v_1
+               l := v_2
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -41051,28 +42603,28 @@ func rewriteValueAMD64_OpAMD64VPSRAVDMasked256(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRAVDMasked256load)
+               v.reset(OpAMD64VPSHRDVD256load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg4(x, y, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAVDMasked512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDVD512(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAVDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VPSHRDVD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRAVDMasked512load {sym} [off] x ptr mask mem)
+       // result: (VPSHRDVD512load {sym} [off] x y ptr mem)
        for {
                x := v_0
-               l := v_1
+               y := v_1
+               l := v_2
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -41080,27 +42632,29 @@ func rewriteValueAMD64_OpAMD64VPSRAVDMasked512(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRAVDMasked512load)
+               v.reset(OpAMD64VPSHRDVD512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg4(x, y, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAVQ128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDVDMasked128(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAVQ128 x l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // match: (VPSHRDVDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRAVQ128load {sym} [off] x ptr mem)
+       // result: (VPSHRDVDMasked128load {sym} [off] x y ptr mask mem)
        for {
                x := v_0
-               l := v_1
+               y := v_1
+               l := v_2
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -41108,26 +42662,30 @@ func rewriteValueAMD64_OpAMD64VPSRAVQ128(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
+               mask := v_3
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRAVQ128load)
+               v.reset(OpAMD64VPSHRDVDMasked128load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg5(x, y, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAVQ256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDVDMasked256(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAVQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // match: (VPSHRDVDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRAVQ256load {sym} [off] x ptr mem)
+       // result: (VPSHRDVDMasked256load {sym} [off] x y ptr mask mem)
        for {
                x := v_0
-               l := v_1
+               y := v_1
+               l := v_2
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -41135,26 +42693,30 @@ func rewriteValueAMD64_OpAMD64VPSRAVQ256(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
+               mask := v_3
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRAVQ256load)
+               v.reset(OpAMD64VPSHRDVDMasked256load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg5(x, y, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAVQ512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDVDMasked512(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAVQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VPSHRDVDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRAVQ512load {sym} [off] x ptr mem)
+       // result: (VPSHRDVDMasked512load {sym} [off] x y ptr mask mem)
        for {
                x := v_0
-               l := v_1
+               y := v_1
+               l := v_2
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -41162,27 +42724,29 @@ func rewriteValueAMD64_OpAMD64VPSRAVQ512(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
+               mask := v_3
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRAVQ512load)
+               v.reset(OpAMD64VPSHRDVDMasked512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg5(x, y, ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAVQMasked128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDVQ128(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAVQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VPSHRDVQ128 x y l:(VMOVDQUload128 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRAVQMasked128load {sym} [off] x ptr mask mem)
+       // result: (VPSHRDVQ128load {sym} [off] x y ptr mem)
        for {
                x := v_0
-               l := v_1
+               y := v_1
+               l := v_2
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -41190,28 +42754,28 @@ func rewriteValueAMD64_OpAMD64VPSRAVQMasked128(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRAVQMasked128load)
+               v.reset(OpAMD64VPSHRDVQ128load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg4(x, y, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAVQMasked256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDVQ256(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAVQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VPSHRDVQ256 x y l:(VMOVDQUload256 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRAVQMasked256load {sym} [off] x ptr mask mem)
+       // result: (VPSHRDVQ256load {sym} [off] x y ptr mem)
        for {
                x := v_0
-               l := v_1
+               y := v_1
+               l := v_2
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -41219,28 +42783,28 @@ func rewriteValueAMD64_OpAMD64VPSRAVQMasked256(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRAVQMasked256load)
+               v.reset(OpAMD64VPSHRDVQ256load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg4(x, y, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAVQMasked512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDVQ512(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAVQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VPSHRDVQ512 x y l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRAVQMasked512load {sym} [off] x ptr mask mem)
+       // result: (VPSHRDVQ512load {sym} [off] x y ptr mem)
        for {
                x := v_0
-               l := v_1
+               y := v_1
+               l := v_2
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -41248,78 +42812,306 @@ func rewriteValueAMD64_OpAMD64VPSRAVQMasked512(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRAVQMasked512load)
+               v.reset(OpAMD64VPSHRDVQ512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg4(x, y, ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAW128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSHRDVQMasked128(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAW128 x (MOVQconst [c]))
-       // result: (VPSRAW128const [uint8(c)] x)
+       // match: (VPSHRDVQMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHRDVQMasked128load {sym} [off] x y ptr mask mem)
        for {
                x := v_0
-               if v_1.Op != OpAMD64MOVQconst {
-                       break
+               y := v_1
+               l := v_2
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_3
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHRDVQMasked128load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg5(x, y, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSHRDVQMasked256(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSHRDVQMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHRDVQMasked256load {sym} [off] x y ptr mask mem)
+       for {
+               x := v_0
+               y := v_1
+               l := v_2
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_3
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHRDVQMasked256load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg5(x, y, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSHRDVQMasked512(v *Value) bool {
+       v_3 := v.Args[3]
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSHRDVQMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHRDVQMasked512load {sym} [off] x y ptr mask mem)
+       for {
+               x := v_0
+               y := v_1
+               l := v_2
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_3
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHRDVQMasked512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg5(x, y, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSHUFD512(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHUFD512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSHUFDMasked128(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHUFDMasked128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSHUFDMasked256(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHUFDMasked256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSHUFDMasked512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSHUFDMasked512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLD128(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLD128 x (MOVQconst [c]))
+       // result: (VPSLLD128const [uint8(c)] x)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
                }
                c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSRAW128const)
+               v.reset(OpAMD64VPSLLD128const)
                v.AuxInt = uint8ToAuxInt(uint8(c))
                v.AddArg(x)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAW256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSLLD256(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAW256 x (MOVQconst [c]))
-       // result: (VPSRAW256const [uint8(c)] x)
+       // match: (VPSLLD256 x (MOVQconst [c]))
+       // result: (VPSLLD256const [uint8(c)] x)
        for {
                x := v_0
                if v_1.Op != OpAMD64MOVQconst {
                        break
                }
                c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSRAW256const)
+               v.reset(OpAMD64VPSLLD256const)
                v.AuxInt = uint8ToAuxInt(uint8(c))
                v.AddArg(x)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAW512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSLLD512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAW512 x (MOVQconst [c]))
-       // result: (VPSRAW512const [uint8(c)] x)
+       // match: (VPSLLD512 x (MOVQconst [c]))
+       // result: (VPSLLD512const [uint8(c)] x)
        for {
                x := v_0
                if v_1.Op != OpAMD64MOVQconst {
                        break
                }
                c := auxIntToInt64(v_1.AuxInt)
-               v.reset(OpAMD64VPSRAW512const)
+               v.reset(OpAMD64VPSLLD512const)
                v.AuxInt = uint8ToAuxInt(uint8(c))
                v.AddArg(x)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAWMasked128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSLLD512const(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (VPSLLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSLLD512constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLDMasked128(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAWMasked128 x (MOVQconst [c]) mask)
-       // result: (VPSRAWMasked128const [uint8(c)] x mask)
+       // match: (VPSLLDMasked128 x (MOVQconst [c]) mask)
+       // result: (VPSLLDMasked128const [uint8(c)] x mask)
        for {
                x := v_0
                if v_1.Op != OpAMD64MOVQconst {
@@ -41327,19 +43119,47 @@ func rewriteValueAMD64_OpAMD64VPSRAWMasked128(v *Value) bool {
                }
                c := auxIntToInt64(v_1.AuxInt)
                mask := v_2
-               v.reset(OpAMD64VPSRAWMasked128const)
+               v.reset(OpAMD64VPSLLDMasked128const)
                v.AuxInt = uint8ToAuxInt(uint8(c))
                v.AddArg2(x, mask)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAWMasked256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSLLDMasked128const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSLLDMasked128constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLDMasked256(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAWMasked256 x (MOVQconst [c]) mask)
-       // result: (VPSRAWMasked256const [uint8(c)] x mask)
+       // match: (VPSLLDMasked256 x (MOVQconst [c]) mask)
+       // result: (VPSLLDMasked256const [uint8(c)] x mask)
        for {
                x := v_0
                if v_1.Op != OpAMD64MOVQconst {
@@ -41347,19 +43167,47 @@ func rewriteValueAMD64_OpAMD64VPSRAWMasked256(v *Value) bool {
                }
                c := auxIntToInt64(v_1.AuxInt)
                mask := v_2
-               v.reset(OpAMD64VPSRAWMasked256const)
+               v.reset(OpAMD64VPSLLDMasked256const)
                v.AuxInt = uint8ToAuxInt(uint8(c))
                v.AddArg2(x, mask)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRAWMasked512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSLLDMasked256const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLDMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSLLDMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSLLDMasked256constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLDMasked512(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRAWMasked512 x (MOVQconst [c]) mask)
-       // result: (VPSRAWMasked512const [uint8(c)] x mask)
+       // match: (VPSLLDMasked512 x (MOVQconst [c]) mask)
+       // result: (VPSLLDMasked512const [uint8(c)] x mask)
        for {
                x := v_0
                if v_1.Op != OpAMD64MOVQconst {
@@ -41367,22 +43215,103 @@ func rewriteValueAMD64_OpAMD64VPSRAWMasked512(v *Value) bool {
                }
                c := auxIntToInt64(v_1.AuxInt)
                mask := v_2
-               v.reset(OpAMD64VPSRAWMasked512const)
+               v.reset(OpAMD64VPSLLDMasked512const)
                v.AuxInt = uint8ToAuxInt(uint8(c))
                v.AddArg2(x, mask)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRLVD512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSLLDMasked512const(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRLVD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VPSLLDMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRLVD512load {sym} [off] x ptr mem)
+       // result: (VPSLLDMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSLLDMasked512constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLQ128(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLQ128 x (MOVQconst [c]))
+       // result: (VPSLLQ128const [uint8(c)] x)
        for {
                x := v_0
-               l := v_1
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               v.reset(OpAMD64VPSLLQ128const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLQ256(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLQ256 x (MOVQconst [c]))
+       // result: (VPSLLQ256const [uint8(c)] x)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               v.reset(OpAMD64VPSLLQ256const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLQ512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLQ512 x (MOVQconst [c]))
+       // result: (VPSLLQ512const [uint8(c)] x)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               v.reset(OpAMD64VPSLLQ512const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLQ512const(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (VPSLLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSLLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -41393,24 +43322,43 @@ func rewriteValueAMD64_OpAMD64VPSRLVD512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRLVD512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPSLLQ512constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRLVDMasked128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSLLQMasked128(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRLVDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
-       // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRLVDMasked128load {sym} [off] x ptr mask mem)
+       // match: (VPSLLQMasked128 x (MOVQconst [c]) mask)
+       // result: (VPSLLQMasked128const [uint8(c)] x mask)
        for {
                x := v_0
-               l := v_1
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               mask := v_2
+               v.reset(OpAMD64VPSLLQMasked128const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg2(x, mask)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLQMasked128const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLQMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSLLQMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -41418,28 +43366,47 @@ func rewriteValueAMD64_OpAMD64VPSRLVDMasked128(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRLVDMasked128load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPSLLQMasked128constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRLVDMasked256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSLLQMasked256(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRLVDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
-       // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRLVDMasked256load {sym} [off] x ptr mask mem)
+       // match: (VPSLLQMasked256 x (MOVQconst [c]) mask)
+       // result: (VPSLLQMasked256const [uint8(c)] x mask)
        for {
                x := v_0
-               l := v_1
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               mask := v_2
+               v.reset(OpAMD64VPSLLQMasked256const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg2(x, mask)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLQMasked256const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLQMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSLLQMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -41447,28 +43414,47 @@ func rewriteValueAMD64_OpAMD64VPSRLVDMasked256(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRLVDMasked256load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPSLLQMasked256constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRLVDMasked512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSLLQMasked512(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRLVDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
-       // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRLVDMasked512load {sym} [off] x ptr mask mem)
+       // match: (VPSLLQMasked512 x (MOVQconst [c]) mask)
+       // result: (VPSLLQMasked512const [uint8(c)] x mask)
        for {
                x := v_0
-               l := v_1
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               mask := v_2
+               v.reset(OpAMD64VPSLLQMasked512const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg2(x, mask)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLQMasked512const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLQMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSLLQMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -41476,24 +43462,24 @@ func rewriteValueAMD64_OpAMD64VPSRLVDMasked512(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRLVDMasked512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VPSLLQMasked512constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRLVQ512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSLLVD512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRLVQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VPSLLVD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRLVQ512load {sym} [off] x ptr mem)
+       // result: (VPSLLVD512load {sym} [off] x ptr mem)
        for {
                x := v_0
                l := v_1
@@ -41507,7 +43493,7 @@ func rewriteValueAMD64_OpAMD64VPSRLVQ512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRLVQ512load)
+               v.reset(OpAMD64VPSLLVD512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg3(x, ptr, mem)
@@ -41515,13 +43501,13 @@ func rewriteValueAMD64_OpAMD64VPSRLVQ512(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRLVQMasked128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSLLVDMasked128(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRLVQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VPSLLVDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRLVQMasked128load {sym} [off] x ptr mask mem)
+       // result: (VPSLLVDMasked128load {sym} [off] x ptr mask mem)
        for {
                x := v_0
                l := v_1
@@ -41536,7 +43522,7 @@ func rewriteValueAMD64_OpAMD64VPSRLVQMasked128(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRLVQMasked128load)
+               v.reset(OpAMD64VPSLLVDMasked128load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg4(x, ptr, mask, mem)
@@ -41544,13 +43530,13 @@ func rewriteValueAMD64_OpAMD64VPSRLVQMasked128(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRLVQMasked256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSLLVDMasked256(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRLVQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VPSLLVDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRLVQMasked256load {sym} [off] x ptr mask mem)
+       // result: (VPSLLVDMasked256load {sym} [off] x ptr mask mem)
        for {
                x := v_0
                l := v_1
@@ -41565,7 +43551,7 @@ func rewriteValueAMD64_OpAMD64VPSRLVQMasked256(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRLVQMasked256load)
+               v.reset(OpAMD64VPSLLVDMasked256load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg4(x, ptr, mask, mem)
@@ -41573,13 +43559,13 @@ func rewriteValueAMD64_OpAMD64VPSRLVQMasked256(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSRLVQMasked512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSLLVDMasked512(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSRLVQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VPSLLVDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSRLVQMasked512load {sym} [off] x ptr mask mem)
+       // result: (VPSLLVDMasked512load {sym} [off] x ptr mask mem)
        for {
                x := v_0
                l := v_1
@@ -41594,7 +43580,7 @@ func rewriteValueAMD64_OpAMD64VPSRLVQMasked512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSRLVQMasked512load)
+               v.reset(OpAMD64VPSLLVDMasked512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg4(x, ptr, mask, mem)
@@ -41602,12 +43588,12 @@ func rewriteValueAMD64_OpAMD64VPSRLVQMasked512(v *Value) bool {
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSUBD512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VPSLLVQ512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSUBD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VPSLLVQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSUBD512load {sym} [off] x ptr mem)
+       // result: (VPSLLVQ512load {sym} [off] x ptr mem)
        for {
                x := v_0
                l := v_1
@@ -41621,7 +43607,1579 @@ func rewriteValueAMD64_OpAMD64VPSUBD512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSUBD512load)
+               v.reset(OpAMD64VPSLLVQ512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLVQMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLVQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSLLVQMasked128load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSLLVQMasked128load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLVQMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLVQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSLLVQMasked256load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSLLVQMasked256load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLVQMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLVQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSLLVQMasked512load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSLLVQMasked512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLW128(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLW128 x (MOVQconst [c]))
+       // result: (VPSLLW128const [uint8(c)] x)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               v.reset(OpAMD64VPSLLW128const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLW256(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLW256 x (MOVQconst [c]))
+       // result: (VPSLLW256const [uint8(c)] x)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               v.reset(OpAMD64VPSLLW256const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLW512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLW512 x (MOVQconst [c]))
+       // result: (VPSLLW512const [uint8(c)] x)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               v.reset(OpAMD64VPSLLW512const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLWMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLWMasked128 x (MOVQconst [c]) mask)
+       // result: (VPSLLWMasked128const [uint8(c)] x mask)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               mask := v_2
+               v.reset(OpAMD64VPSLLWMasked128const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg2(x, mask)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLWMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLWMasked256 x (MOVQconst [c]) mask)
+       // result: (VPSLLWMasked256const [uint8(c)] x mask)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               mask := v_2
+               v.reset(OpAMD64VPSLLWMasked256const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg2(x, mask)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSLLWMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSLLWMasked512 x (MOVQconst [c]) mask)
+       // result: (VPSLLWMasked512const [uint8(c)] x mask)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               mask := v_2
+               v.reset(OpAMD64VPSLLWMasked512const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg2(x, mask)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAD128(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAD128 x (MOVQconst [c]))
+       // result: (VPSRAD128const [uint8(c)] x)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               v.reset(OpAMD64VPSRAD128const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAD256(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAD256 x (MOVQconst [c]))
+       // result: (VPSRAD256const [uint8(c)] x)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               v.reset(OpAMD64VPSRAD256const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAD512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAD512 x (MOVQconst [c]))
+       // result: (VPSRAD512const [uint8(c)] x)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               v.reset(OpAMD64VPSRAD512const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAD512const(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (VPSRAD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAD512constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRADMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRADMasked128 x (MOVQconst [c]) mask)
+       // result: (VPSRADMasked128const [uint8(c)] x mask)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               mask := v_2
+               v.reset(OpAMD64VPSRADMasked128const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg2(x, mask)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRADMasked128const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRADMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRADMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRADMasked128constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRADMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRADMasked256 x (MOVQconst [c]) mask)
+       // result: (VPSRADMasked256const [uint8(c)] x mask)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               mask := v_2
+               v.reset(OpAMD64VPSRADMasked256const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg2(x, mask)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRADMasked256const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRADMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRADMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRADMasked256constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRADMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRADMasked512 x (MOVQconst [c]) mask)
+       // result: (VPSRADMasked512const [uint8(c)] x mask)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               mask := v_2
+               v.reset(OpAMD64VPSRADMasked512const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg2(x, mask)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRADMasked512const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRADMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRADMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRADMasked512constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAQ128(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAQ128 x (MOVQconst [c]))
+       // result: (VPSRAQ128const [uint8(c)] x)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               v.reset(OpAMD64VPSRAQ128const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAQ128const(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (VPSRAQ128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAQ128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAQ128constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAQ256(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAQ256 x (MOVQconst [c]))
+       // result: (VPSRAQ256const [uint8(c)] x)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               v.reset(OpAMD64VPSRAQ256const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAQ256const(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (VPSRAQ256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAQ256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAQ256constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAQ512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAQ512 x (MOVQconst [c]))
+       // result: (VPSRAQ512const [uint8(c)] x)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               v.reset(OpAMD64VPSRAQ512const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAQ512const(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (VPSRAQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAQ512constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
+       }
+       return false
+}
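The VPSRAQ*const matchers above are the generated form of a two-step rewrite: a MOVQconst shift count is first folded into the op's imm8 (e.g. VPSRAQ512 -> VPSRAQ512const), and a VMOVDQU load feeding the const form is then folded into a memory operand, with the imm8 and the load offset packed into one AuxInt via makeValAndOff. The standalone sketch below illustrates that packing; it assumes the ValAndOff layout keeps the value in the upper 32 bits and the offset in the lower 32 bits, and its names are illustrative rather than the compiler's own.

	package main

	import "fmt"

	// valAndOff mirrors the assumed ssa.ValAndOff layout: the sign-extended
	// immediate in the upper 32 bits, the load offset in the lower 32 bits.
	type valAndOff int64

	func makeValAndOff(val, off int32) valAndOff {
		return valAndOff(int64(val)<<32 | int64(uint32(off)))
	}

	func (x valAndOff) Val() int32 { return int32(x >> 32) } // recover the imm8
	func (x valAndOff) Off() int32 { return int32(x) }       // recover the offset

	func main() {
		c := uint8(63)   // imm8 shift count carried by a *const op
		off := int32(64) // byte offset carried by the merged VMOVDQU load
		a := makeValAndOff(int32(int8(c)), off)
		fmt.Println(a.Val(), a.Off()) // 63 64
	}

The int32(int8(c)) conversion in the generated code sign-extends the 8-bit immediate before packing, which the sketch reproduces.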
+func rewriteValueAMD64_OpAMD64VPSRAQMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAQMasked128 x (MOVQconst [c]) mask)
+       // result: (VPSRAQMasked128const [uint8(c)] x mask)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               mask := v_2
+               v.reset(OpAMD64VPSRAQMasked128const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg2(x, mask)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAQMasked128const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAQMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAQMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAQMasked128constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAQMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAQMasked256 x (MOVQconst [c]) mask)
+       // result: (VPSRAQMasked256const [uint8(c)] x mask)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               mask := v_2
+               v.reset(OpAMD64VPSRAQMasked256const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg2(x, mask)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAQMasked256const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAQMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAQMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAQMasked256constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAQMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAQMasked512 x (MOVQconst [c]) mask)
+       // result: (VPSRAQMasked512const [uint8(c)] x mask)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               mask := v_2
+               v.reset(OpAMD64VPSRAQMasked512const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg2(x, mask)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAQMasked512const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAQMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAQMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAQMasked512constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAVD512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAVD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAVD512load {sym} [off] x ptr mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAVD512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAVDMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAVDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAVDMasked128load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAVDMasked128load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAVDMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAVDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAVDMasked256load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAVDMasked256load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAVDMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAVDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAVDMasked512load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAVDMasked512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAVQ128(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAVQ128 x l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAVQ128load {sym} [off] x ptr mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAVQ128load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAVQ256(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAVQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAVQ256load {sym} [off] x ptr mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAVQ256load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAVQ512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAVQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAVQ512load {sym} [off] x ptr mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAVQ512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAVQMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAVQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAVQMasked128load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAVQMasked128load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAVQMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAVQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAVQMasked256load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAVQMasked256load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAVQMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAVQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRAVQMasked512load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRAVQMasked512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAW128(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAW128 x (MOVQconst [c]))
+       // result: (VPSRAW128const [uint8(c)] x)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               v.reset(OpAMD64VPSRAW128const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAW256(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAW256 x (MOVQconst [c]))
+       // result: (VPSRAW256const [uint8(c)] x)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               v.reset(OpAMD64VPSRAW256const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAW512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAW512 x (MOVQconst [c]))
+       // result: (VPSRAW512const [uint8(c)] x)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               v.reset(OpAMD64VPSRAW512const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAWMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAWMasked128 x (MOVQconst [c]) mask)
+       // result: (VPSRAWMasked128const [uint8(c)] x mask)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               mask := v_2
+               v.reset(OpAMD64VPSRAWMasked128const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg2(x, mask)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAWMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAWMasked256 x (MOVQconst [c]) mask)
+       // result: (VPSRAWMasked256const [uint8(c)] x mask)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               mask := v_2
+               v.reset(OpAMD64VPSRAWMasked256const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg2(x, mask)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRAWMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRAWMasked512 x (MOVQconst [c]) mask)
+       // result: (VPSRAWMasked512const [uint8(c)] x mask)
+       for {
+               x := v_0
+               if v_1.Op != OpAMD64MOVQconst {
+                       break
+               }
+               c := auxIntToInt64(v_1.AuxInt)
+               mask := v_2
+               v.reset(OpAMD64VPSRAWMasked512const)
+               v.AuxInt = uint8ToAuxInt(uint8(c))
+               v.AddArg2(x, mask)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRLD512const(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (VPSRLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRLD512constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRLDMasked128const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRLDMasked128constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRLDMasked256const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRLDMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRLDMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRLDMasked256constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRLDMasked512const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRLDMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRLDMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRLDMasked512constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRLQ512const(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (VPSRLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRLQ512constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRLQMasked128const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRLQMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRLQMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRLQMasked128constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRLQMasked256const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRLQMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRLQMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRLQMasked256constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRLQMasked512const(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRLQMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRLQMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
+       for {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRLQMasked512constload)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRLVD512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRLVD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRLVD512load {sym} [off] x ptr mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRLVD512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRLVDMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRLVDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRLVDMasked128load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRLVDMasked128load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRLVDMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRLVDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRLVDMasked256load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRLVDMasked256load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRLVDMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRLVDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRLVDMasked512load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRLVDMasked512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRLVQ512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRLVQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRLVQ512load {sym} [off] x ptr mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRLVQ512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRLVQMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRLVQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRLVQMasked128load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRLVQMasked128load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRLVQMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRLVQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRLVQMasked256load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRLVQMasked256load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSRLVQMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSRLVQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSRLVQMasked512load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSRLVQMasked512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSUBD512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSUBD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSUBD512load {sym} [off] x ptr mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSUBD512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
                v.AddArg3(x, ptr, mem)
@@ -41633,98 +45191,885 @@ func rewriteValueAMD64_OpAMD64VPSUBDMasked128(v *Value) bool {
        v_2 := v.Args[2]
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSUBDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VPSUBDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSUBDMasked128load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSUBDMasked128load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSUBDMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSUBDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSUBDMasked256load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSUBDMasked256load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSUBDMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSUBDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSUBDMasked512load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSUBDMasked512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSUBQ512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSUBQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSUBQ512load {sym} [off] x ptr mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSUBQ512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSUBQMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSUBQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSUBQMasked128load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSUBQMasked128load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSUBQMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSUBQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSUBQMasked256load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSUBQMasked256load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPSUBQMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPSUBQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPSUBQMasked512load {sym} [off] x ptr mask mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_2
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPSUBQMasked512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg4(x, ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPUNPCKHDQ512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPUNPCKHDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPUNPCKHDQ512load {sym} [off] x ptr mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPUNPCKHDQ512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPUNPCKHQDQ512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPUNPCKHQDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPUNPCKHQDQ512load {sym} [off] x ptr mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPUNPCKHQDQ512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPUNPCKLDQ512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPUNPCKLDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPUNPCKLDQ512load {sym} [off] x ptr mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPUNPCKLDQ512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPUNPCKLQDQ512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPUNPCKLQDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPUNPCKLQDQ512load {sym} [off] x ptr mem)
+       for {
+               x := v_0
+               l := v_1
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VPUNPCKLQDQ512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(x, ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPXORD512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPXORD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPXORD512load {sym} [off] x ptr mem)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload512 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPXORD512load)
+                       v.AuxInt = int32ToAuxInt(off)
+                       v.Aux = symToAux(sym)
+                       v.AddArg3(x, ptr, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
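Unlike the VPSUB* matchers, VPXORD512 and the other XOR forms below are commutative, so their generated matchers try both operand orders: the _i0 loop swaps v_0 and v_1 on its second pass so the load can be merged no matter which side it appears on. A minimal standalone sketch of that swap idiom, with hypothetical names, is:

	package main

	import "fmt"

	// tryBothOrders mimics the generated commutative-match idiom: attempt the
	// rewrite with (a, b), then once more with the operands swapped.
	func tryBothOrders(a, b string, match func(x, l string) bool) bool {
		for _i0 := 0; _i0 <= 1; _i0, a, b = _i0+1, b, a {
			if match(a, b) {
				return true
			}
		}
		return false
	}

	func main() {
		isLoad := func(x, l string) bool { return l == "load" }
		fmt.Println(tryBothOrders("load", "x", isLoad)) // true: matched after the swap
	}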
+func rewriteValueAMD64_OpAMD64VPXORDMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPXORDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSUBDMasked128load {sym} [off] x ptr mask mem)
+       // result: (VPXORDMasked128load {sym} [off] x ptr mask mem)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload128 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPXORDMasked128load)
+                       v.AuxInt = int32ToAuxInt(off)
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPXORDMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPXORDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPXORDMasked256load {sym} [off] x ptr mask mem)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload256 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPXORDMasked256load)
+                       v.AuxInt = int32ToAuxInt(off)
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPXORDMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPXORDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPXORDMasked512load {sym} [off] x ptr mask mem)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload512 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPXORDMasked512load)
+                       v.AuxInt = int32ToAuxInt(off)
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPXORQ512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPXORQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPXORQ512load {sym} [off] x ptr mem)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload512 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPXORQ512load)
+                       v.AuxInt = int32ToAuxInt(off)
+                       v.Aux = symToAux(sym)
+                       v.AddArg3(x, ptr, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPXORQMasked128(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPXORQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPXORQMasked128load {sym} [off] x ptr mask mem)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload128 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPXORQMasked128load)
+                       v.AuxInt = int32ToAuxInt(off)
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPXORQMasked256(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPXORQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPXORQMasked256load {sym} [off] x ptr mask mem)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload256 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPXORQMasked256load)
+                       v.AuxInt = int32ToAuxInt(off)
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VPXORQMasked512(v *Value) bool {
+       v_2 := v.Args[2]
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VPXORQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VPXORQMasked512load {sym} [off] x ptr mask mem)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       l := v_1
+                       if l.Op != OpAMD64VMOVDQUload512 {
+                               continue
+                       }
+                       off := auxIntToInt32(l.AuxInt)
+                       sym := auxToSym(l.Aux)
+                       mem := l.Args[1]
+                       ptr := l.Args[0]
+                       mask := v_2
+                       if !(canMergeLoad(v, l) && clobber(l)) {
+                               continue
+                       }
+                       v.reset(OpAMD64VPXORQMasked512load)
+                       v.AuxInt = int32ToAuxInt(off)
+                       v.Aux = symToAux(sym)
+                       v.AddArg4(x, ptr, mask, mem)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VRCP14PD128(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (VRCP14PD128 l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VRCP14PD128load {sym} [off] ptr mem)
+       for {
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VRCP14PD128load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VRCP14PD256(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (VRCP14PD256 l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VRCP14PD256load {sym} [off] ptr mem)
+       for {
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VRCP14PD256load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VRCP14PD512(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (VRCP14PD512 l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VRCP14PD512load {sym} [off] ptr mem)
+       for {
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VRCP14PD512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VRCP14PDMasked128(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VRCP14PDMasked128 l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VRCP14PDMasked128load {sym} [off] ptr mask mem)
+       for {
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VRCP14PDMasked128load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VRCP14PDMasked256(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VRCP14PDMasked256 l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VRCP14PDMasked256load {sym} [off] ptr mask mem)
+       for {
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VRCP14PDMasked256load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VRCP14PDMasked512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VRCP14PDMasked512 l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VRCP14PDMasked512load {sym} [off] ptr mask mem)
+       for {
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VRCP14PDMasked512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VRCP14PS512(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (VRCP14PS512 l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VRCP14PS512load {sym} [off] ptr mem)
+       for {
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VRCP14PS512load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VRCP14PSMasked128(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VRCP14PSMasked128 l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VRCP14PSMasked128load {sym} [off] ptr mask mem)
+       for {
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VRCP14PSMasked128load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VRCP14PSMasked256(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VRCP14PSMasked256 l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VRCP14PSMasked256load {sym} [off] ptr mask mem)
+       for {
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
+               }
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
+               }
+               v.reset(OpAMD64VRCP14PSMasked256load)
+               v.AuxInt = int32ToAuxInt(off)
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
+       }
+       return false
+}
+func rewriteValueAMD64_OpAMD64VRCP14PSMasked512(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (VRCP14PSMasked512 l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // cond: canMergeLoad(v, l) && clobber(l)
+       // result: (VRCP14PSMasked512load {sym} [off] ptr mask mem)
        for {
-               x := v_0
-               l := v_1
-               if l.Op != OpAMD64VMOVDQUload128 {
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
                off := auxIntToInt32(l.AuxInt)
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSUBDMasked128load)
+               v.reset(OpAMD64VRCP14PSMasked512load)
                v.AuxInt = int32ToAuxInt(off)
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSUBDMasked256(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VREDUCEPD128(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPSUBDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VREDUCEPD128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSUBDMasked256load {sym} [off] x ptr mask mem)
+       // result: (VREDUCEPD128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               l := v_1
-               if l.Op != OpAMD64VMOVDQUload256 {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
                off := auxIntToInt32(l.AuxInt)
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSUBDMasked256load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VREDUCEPD128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSUBDMasked512(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VREDUCEPD256(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPSUBDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VREDUCEPD256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSUBDMasked512load {sym} [off] x ptr mask mem)
+       // result: (VREDUCEPD256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               l := v_1
-               if l.Op != OpAMD64VMOVDQUload512 {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
                off := auxIntToInt32(l.AuxInt)
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSUBDMasked512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VREDUCEPD256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSUBQ512(v *Value) bool {
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VREDUCEPD512(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPSUBQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VREDUCEPD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSUBQ512load {sym} [off] x ptr mem)
+       // result: (VREDUCEPD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -41735,24 +46080,23 @@ func rewriteValueAMD64_OpAMD64VPSUBQ512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSUBQ512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VREDUCEPD512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSUBQMasked128(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VREDUCEPDMasked128(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSUBQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VREDUCEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSUBQMasked128load {sym} [off] x ptr mask mem)
+       // result: (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
@@ -41760,28 +46104,27 @@ func rewriteValueAMD64_OpAMD64VPSUBQMasked128(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSUBQMasked128load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VREDUCEPDMasked128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSUBQMasked256(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VREDUCEPDMasked256(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSUBQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VREDUCEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSUBQMasked256load {sym} [off] x ptr mask mem)
+       // result: (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
@@ -41789,28 +46132,27 @@ func rewriteValueAMD64_OpAMD64VPSUBQMasked256(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSUBQMasked256load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VREDUCEPDMasked256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPSUBQMasked512(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VREDUCEPDMasked512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPSUBQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VREDUCEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPSUBQMasked512load {sym} [off] x ptr mask mem)
+       // result: (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -41818,28 +46160,27 @@ func rewriteValueAMD64_OpAMD64VPSUBQMasked512(v *Value) bool {
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_2
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPSUBQMasked512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VREDUCEPDMasked512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg4(x, ptr, mask, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPUNPCKHDQ512(v *Value) bool {
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VREDUCEPS128(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPUNPCKHDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VREDUCEPS128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPUNPCKHDQ512load {sym} [off] x ptr mem)
+       // result: (VREDUCEPS128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               l := v_1
-               if l.Op != OpAMD64VMOVDQUload512 {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
                off := auxIntToInt32(l.AuxInt)
@@ -41849,24 +46190,23 @@ func rewriteValueAMD64_OpAMD64VPUNPCKHDQ512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPUNPCKHDQ512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VREDUCEPS128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPUNPCKHQDQ512(v *Value) bool {
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VREDUCEPS256(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPUNPCKHQDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VREDUCEPS256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPUNPCKHQDQ512load {sym} [off] x ptr mem)
+       // result: (VREDUCEPS256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               l := v_1
-               if l.Op != OpAMD64VMOVDQUload512 {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
                off := auxIntToInt32(l.AuxInt)
@@ -41876,23 +46216,22 @@ func rewriteValueAMD64_OpAMD64VPUNPCKHQDQ512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPUNPCKHQDQ512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VREDUCEPS256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPUNPCKLDQ512(v *Value) bool {
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VREDUCEPS512(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPUNPCKLDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VREDUCEPS512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPUNPCKLDQ512load {sym} [off] x ptr mem)
+       // result: (VREDUCEPS512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               x := v_0
-               l := v_1
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
@@ -41903,301 +46242,159 @@ func rewriteValueAMD64_OpAMD64VPUNPCKLDQ512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPUNPCKLDQ512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VREDUCEPS512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPUNPCKLQDQ512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VREDUCEPSMasked128(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPUNPCKLQDQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VREDUCEPSMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPUNPCKLQDQ512load {sym} [off] x ptr mem)
+       // result: (VREDUCEPSMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               x := v_0
-               l := v_1
-               if l.Op != OpAMD64VMOVDQUload512 {
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
                off := auxIntToInt32(l.AuxInt)
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VPUNPCKLQDQ512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VREDUCEPSMasked128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg3(x, ptr, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPXORD512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VREDUCEPSMasked256(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPXORD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VREDUCEPSMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPXORD512load {sym} [off] x ptr mem)
+       // result: (VREDUCEPSMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
-                       x := v_0
-                       l := v_1
-                       if l.Op != OpAMD64VMOVDQUload512 {
-                               continue
-                       }
-                       off := auxIntToInt32(l.AuxInt)
-                       sym := auxToSym(l.Aux)
-                       mem := l.Args[1]
-                       ptr := l.Args[0]
-                       if !(canMergeLoad(v, l) && clobber(l)) {
-                               continue
-                       }
-                       v.reset(OpAMD64VPXORD512load)
-                       v.AuxInt = int32ToAuxInt(off)
-                       v.Aux = symToAux(sym)
-                       v.AddArg3(x, ptr, mem)
-                       return true
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
                }
-               break
-       }
-       return false
-}
-func rewriteValueAMD64_OpAMD64VPXORDMasked128(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       // match: (VPXORDMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
-       // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPXORDMasked128load {sym} [off] x ptr mask mem)
-       for {
-               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
-                       x := v_0
-                       l := v_1
-                       if l.Op != OpAMD64VMOVDQUload128 {
-                               continue
-                       }
-                       off := auxIntToInt32(l.AuxInt)
-                       sym := auxToSym(l.Aux)
-                       mem := l.Args[1]
-                       ptr := l.Args[0]
-                       mask := v_2
-                       if !(canMergeLoad(v, l) && clobber(l)) {
-                               continue
-                       }
-                       v.reset(OpAMD64VPXORDMasked128load)
-                       v.AuxInt = int32ToAuxInt(off)
-                       v.Aux = symToAux(sym)
-                       v.AddArg4(x, ptr, mask, mem)
-                       return true
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
                }
-               break
+               v.reset(OpAMD64VREDUCEPSMasked256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPXORDMasked256(v *Value) bool {
-       v_2 := v.Args[2]
+func rewriteValueAMD64_OpAMD64VREDUCEPSMasked512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VPXORDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VREDUCEPSMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPXORDMasked256load {sym} [off] x ptr mask mem)
+       // result: (VREDUCEPSMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
-               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
-                       x := v_0
-                       l := v_1
-                       if l.Op != OpAMD64VMOVDQUload256 {
-                               continue
-                       }
-                       off := auxIntToInt32(l.AuxInt)
-                       sym := auxToSym(l.Aux)
-                       mem := l.Args[1]
-                       ptr := l.Args[0]
-                       mask := v_2
-                       if !(canMergeLoad(v, l) && clobber(l)) {
-                               continue
-                       }
-                       v.reset(OpAMD64VPXORDMasked256load)
-                       v.AuxInt = int32ToAuxInt(off)
-                       v.Aux = symToAux(sym)
-                       v.AddArg4(x, ptr, mask, mem)
-                       return true
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload512 {
+                       break
                }
-               break
-       }
-       return false
-}
-func rewriteValueAMD64_OpAMD64VPXORDMasked512(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       // match: (VPXORDMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
-       // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPXORDMasked512load {sym} [off] x ptr mask mem)
-       for {
-               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
-                       x := v_0
-                       l := v_1
-                       if l.Op != OpAMD64VMOVDQUload512 {
-                               continue
-                       }
-                       off := auxIntToInt32(l.AuxInt)
-                       sym := auxToSym(l.Aux)
-                       mem := l.Args[1]
-                       ptr := l.Args[0]
-                       mask := v_2
-                       if !(canMergeLoad(v, l) && clobber(l)) {
-                               continue
-                       }
-                       v.reset(OpAMD64VPXORDMasked512load)
-                       v.AuxInt = int32ToAuxInt(off)
-                       v.Aux = symToAux(sym)
-                       v.AddArg4(x, ptr, mask, mem)
-                       return true
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               mask := v_1
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
                }
-               break
+               v.reset(OpAMD64VREDUCEPSMasked512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg3(ptr, mask, mem)
+               return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPXORQ512(v *Value) bool {
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VRNDSCALEPD128(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPXORQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VRNDSCALEPD128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPXORQ512load {sym} [off] x ptr mem)
+       // result: (VRNDSCALEPD128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
-                       x := v_0
-                       l := v_1
-                       if l.Op != OpAMD64VMOVDQUload512 {
-                               continue
-                       }
-                       off := auxIntToInt32(l.AuxInt)
-                       sym := auxToSym(l.Aux)
-                       mem := l.Args[1]
-                       ptr := l.Args[0]
-                       if !(canMergeLoad(v, l) && clobber(l)) {
-                               continue
-                       }
-                       v.reset(OpAMD64VPXORQ512load)
-                       v.AuxInt = int32ToAuxInt(off)
-                       v.Aux = symToAux(sym)
-                       v.AddArg3(x, ptr, mem)
-                       return true
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload128 {
+                       break
                }
-               break
-       }
-       return false
-}
-func rewriteValueAMD64_OpAMD64VPXORQMasked128(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       // match: (VPXORQMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
-       // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPXORQMasked128load {sym} [off] x ptr mask mem)
-       for {
-               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
-                       x := v_0
-                       l := v_1
-                       if l.Op != OpAMD64VMOVDQUload128 {
-                               continue
-                       }
-                       off := auxIntToInt32(l.AuxInt)
-                       sym := auxToSym(l.Aux)
-                       mem := l.Args[1]
-                       ptr := l.Args[0]
-                       mask := v_2
-                       if !(canMergeLoad(v, l) && clobber(l)) {
-                               continue
-                       }
-                       v.reset(OpAMD64VPXORQMasked128load)
-                       v.AuxInt = int32ToAuxInt(off)
-                       v.Aux = symToAux(sym)
-                       v.AddArg4(x, ptr, mask, mem)
-                       return true
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
                }
-               break
+               v.reset(OpAMD64VRNDSCALEPD128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VPXORQMasked256(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VRNDSCALEPD256(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VPXORQMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VRNDSCALEPD256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPXORQMasked256load {sym} [off] x ptr mask mem)
+       // result: (VRNDSCALEPD256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
-               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
-                       x := v_0
-                       l := v_1
-                       if l.Op != OpAMD64VMOVDQUload256 {
-                               continue
-                       }
-                       off := auxIntToInt32(l.AuxInt)
-                       sym := auxToSym(l.Aux)
-                       mem := l.Args[1]
-                       ptr := l.Args[0]
-                       mask := v_2
-                       if !(canMergeLoad(v, l) && clobber(l)) {
-                               continue
-                       }
-                       v.reset(OpAMD64VPXORQMasked256load)
-                       v.AuxInt = int32ToAuxInt(off)
-                       v.Aux = symToAux(sym)
-                       v.AddArg4(x, ptr, mask, mem)
-                       return true
+               c := auxIntToUint8(v.AuxInt)
+               l := v_0
+               if l.Op != OpAMD64VMOVDQUload256 {
+                       break
                }
-               break
-       }
-       return false
-}
-func rewriteValueAMD64_OpAMD64VPXORQMasked512(v *Value) bool {
-       v_2 := v.Args[2]
-       v_1 := v.Args[1]
-       v_0 := v.Args[0]
-       // match: (VPXORQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
-       // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VPXORQMasked512load {sym} [off] x ptr mask mem)
-       for {
-               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
-                       x := v_0
-                       l := v_1
-                       if l.Op != OpAMD64VMOVDQUload512 {
-                               continue
-                       }
-                       off := auxIntToInt32(l.AuxInt)
-                       sym := auxToSym(l.Aux)
-                       mem := l.Args[1]
-                       ptr := l.Args[0]
-                       mask := v_2
-                       if !(canMergeLoad(v, l) && clobber(l)) {
-                               continue
-                       }
-                       v.reset(OpAMD64VPXORQMasked512load)
-                       v.AuxInt = int32ToAuxInt(off)
-                       v.Aux = symToAux(sym)
-                       v.AddArg4(x, ptr, mask, mem)
-                       return true
+               off := auxIntToInt32(l.AuxInt)
+               sym := auxToSym(l.Aux)
+               mem := l.Args[1]
+               ptr := l.Args[0]
+               if !(canMergeLoad(v, l) && clobber(l)) {
+                       break
                }
-               break
+               v.reset(OpAMD64VRNDSCALEPD256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
+               v.Aux = symToAux(sym)
+               v.AddArg2(ptr, mem)
+               return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VRCP14PD128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VRNDSCALEPD512(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VRCP14PD128 l:(VMOVDQUload128 {sym} [off] ptr mem))
+       // match: (VRNDSCALEPD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VRCP14PD128load {sym} [off] ptr mem)
+       // result: (VRNDSCALEPD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                l := v_0
-               if l.Op != OpAMD64VMOVDQUload128 {
+               if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
                off := auxIntToInt32(l.AuxInt)
@@ -42207,73 +46404,80 @@ func rewriteValueAMD64_OpAMD64VRCP14PD128(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VRCP14PD128load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VRNDSCALEPD512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
                v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VRCP14PD256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VRNDSCALEPDMasked128(v *Value) bool {
+       v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VRCP14PD256 l:(VMOVDQUload256 {sym} [off] ptr mem))
+       // match: (VRNDSCALEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VRCP14PD256load {sym} [off] ptr mem)
+       // result: (VRNDSCALEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                l := v_0
-               if l.Op != OpAMD64VMOVDQUload256 {
+               if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
                off := auxIntToInt32(l.AuxInt)
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VRCP14PD256load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VRNDSCALEPDMasked128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg2(ptr, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VRCP14PD512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VRNDSCALEPDMasked256(v *Value) bool {
+       v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VRCP14PD512 l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VRNDSCALEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VRCP14PD512load {sym} [off] ptr mem)
+       // result: (VRNDSCALEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                l := v_0
-               if l.Op != OpAMD64VMOVDQUload512 {
+               if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
                off := auxIntToInt32(l.AuxInt)
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
+               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VRCP14PD512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VRNDSCALEPDMasked256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg2(ptr, mem)
+               v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VRCP14PDMasked128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VRNDSCALEPDMasked512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VRCP14PDMasked128 l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VRNDSCALEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VRCP14PDMasked128load {sym} [off] ptr mask mem)
+       // result: (VRNDSCALEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                l := v_0
-               if l.Op != OpAMD64VMOVDQUload128 {
+               if l.Op != OpAMD64VMOVDQUload512 {
                        break
                }
                off := auxIntToInt32(l.AuxInt)
@@ -42284,74 +46488,73 @@ func rewriteValueAMD64_OpAMD64VRCP14PDMasked128(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VRCP14PDMasked128load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VRNDSCALEPDMasked512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
                v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VRCP14PDMasked256(v *Value) bool {
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VRNDSCALEPS128(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VRCP14PDMasked256 l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VRNDSCALEPS128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VRCP14PDMasked256load {sym} [off] ptr mask mem)
+       // result: (VRNDSCALEPS128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                l := v_0
-               if l.Op != OpAMD64VMOVDQUload256 {
+               if l.Op != OpAMD64VMOVDQUload128 {
                        break
                }
                off := auxIntToInt32(l.AuxInt)
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VRCP14PDMasked256load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VRNDSCALEPS128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg3(ptr, mask, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VRCP14PDMasked512(v *Value) bool {
-       v_1 := v.Args[1]
+func rewriteValueAMD64_OpAMD64VRNDSCALEPS256(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VRCP14PDMasked512 l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VRNDSCALEPS256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VRCP14PDMasked512load {sym} [off] ptr mask mem)
+       // result: (VRNDSCALEPS256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                l := v_0
-               if l.Op != OpAMD64VMOVDQUload512 {
+               if l.Op != OpAMD64VMOVDQUload256 {
                        break
                }
                off := auxIntToInt32(l.AuxInt)
                sym := auxToSym(l.Aux)
                mem := l.Args[1]
                ptr := l.Args[0]
-               mask := v_1
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VRCP14PDMasked512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VRNDSCALEPS256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
-               v.AddArg3(ptr, mask, mem)
+               v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VRCP14PS512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VRNDSCALEPS512(v *Value) bool {
        v_0 := v.Args[0]
-       // match: (VRCP14PS512 l:(VMOVDQUload512 {sym} [off] ptr mem))
+       // match: (VRNDSCALEPS512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem))
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VRCP14PS512load {sym} [off] ptr mem)
+       // result: (VRNDSCALEPS512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                l := v_0
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
@@ -42363,21 +46566,22 @@ func rewriteValueAMD64_OpAMD64VRCP14PS512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VRCP14PS512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VRNDSCALEPS512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
                v.AddArg2(ptr, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VRCP14PSMasked128(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VRNDSCALEPSMasked128(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VRCP14PSMasked128 l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
+       // match: (VRNDSCALEPSMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VRCP14PSMasked128load {sym} [off] ptr mask mem)
+       // result: (VRNDSCALEPSMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                l := v_0
                if l.Op != OpAMD64VMOVDQUload128 {
                        break
@@ -42390,21 +46594,22 @@ func rewriteValueAMD64_OpAMD64VRCP14PSMasked128(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VRCP14PSMasked128load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VRNDSCALEPSMasked128load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
                v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VRCP14PSMasked256(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VRNDSCALEPSMasked256(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VRCP14PSMasked256 l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
+       // match: (VRNDSCALEPSMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VRCP14PSMasked256load {sym} [off] ptr mask mem)
+       // result: (VRNDSCALEPSMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                l := v_0
                if l.Op != OpAMD64VMOVDQUload256 {
                        break
@@ -42417,21 +46622,22 @@ func rewriteValueAMD64_OpAMD64VRCP14PSMasked256(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VRCP14PSMasked256load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VRNDSCALEPSMasked256load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
                v.AddArg3(ptr, mask, mem)
                return true
        }
        return false
 }
-func rewriteValueAMD64_OpAMD64VRCP14PSMasked512(v *Value) bool {
+func rewriteValueAMD64_OpAMD64VRNDSCALEPSMasked512(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
-       // match: (VRCP14PSMasked512 l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
+       // match: (VRNDSCALEPSMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
        // cond: canMergeLoad(v, l) && clobber(l)
-       // result: (VRCP14PSMasked512load {sym} [off] ptr mask mem)
+       // result: (VRNDSCALEPSMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
        for {
+               c := auxIntToUint8(v.AuxInt)
                l := v_0
                if l.Op != OpAMD64VMOVDQUload512 {
                        break
@@ -42444,8 +46650,8 @@ func rewriteValueAMD64_OpAMD64VRCP14PSMasked512(v *Value) bool {
                if !(canMergeLoad(v, l) && clobber(l)) {
                        break
                }
-               v.reset(OpAMD64VRCP14PSMasked512load)
-               v.AuxInt = int32ToAuxInt(off)
+               v.reset(OpAMD64VRNDSCALEPSMasked512load)
+               v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off))
                v.Aux = symToAux(sym)
                v.AddArg3(ptr, mask, mem)
                return true
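
Note (editor): the mem-form ops generated above for const-imm instructions (the VREDUCE* and VRNDSCALE* variants) carry both the 8-bit immediate and the load offset in a single AuxInt via makeValAndOff(int32(int8(c)), off); int32(int8(c)) reinterprets the stored uint8 immediate as a signed 8-bit value before widening it. Below is a minimal, self-contained sketch of that packing, mirroring the ssa ValAndOff convention (value in the high 32 bits, offset in the low 32 bits); it is an illustration only, not the compiler's actual implementation.

package main

import "fmt"

// valAndOff mimics how a value and an offset can share one 64-bit AuxInt:
// the value occupies the high 32 bits, the offset the low 32 bits.
type valAndOff int64

func makeValAndOff(val, off int32) valAndOff {
	return valAndOff(int64(val)<<32 | int64(uint32(off)))
}

func (x valAndOff) val() int32 { return int32(int64(x) >> 32) }
func (x valAndOff) off() int32 { return int32(int64(x)) }

func main() {
	// Hypothetical imm8 (e.g. a rounding/reduce control byte) and load offset.
	c := uint8(0x0B)
	off := int32(64)
	vo := makeValAndOff(int32(int8(c)), off) // same conversion chain as the rewrites above
	fmt.Println(vo.val(), vo.off())          // 11 64
}
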
index c9fae4eed73e50fb51c6e018281457e0fab0ef2b..2339a1910d76d4a297722886f5c1c10184b68c3b 100644 (file)
@@ -236,7 +236,7 @@ func writeSIMDRules(ops []Operation) *bytes.Buffer {
                                panic("simdgen sees unknown special lower " + *gOp.SpecialLower + ", maybe implement it?")
                        }
                }
-               if gOp.MemFeatures != nil && *gOp.MemFeatures == "vbcst" && immType == NoImm {
+               if gOp.MemFeatures != nil && *gOp.MemFeatures == "vbcst" {
                        // sanity check
                        selected := true
                        for _, a := range gOp.In {
@@ -257,9 +257,21 @@ func writeSIMDRules(ops []Operation) *bytes.Buffer {
                                }
                                memOpData := data
                                // Remove the last vreg from the arg and change it to a load.
-                               memOpData.ArgsLoadAddr = data.Args[:len(data.Args)-1] + fmt.Sprintf("l:(VMOVDQUload%d {sym} [off] ptr mem)", *lastVreg.Bits)
+                               origArgs := data.Args[:len(data.Args)-1]
+                               // Prepare imm args.
+                               immArg := ""
+                               immArgCombineOff := " [off] "
+                               if immType != NoImm && immType != InvalidImm {
+                                       _, after, found := strings.Cut(origArgs, "]")
+                                       if found {
+                                               origArgs = after
+                                       }
+                                       immArg = "[c] "
+                                       immArgCombineOff = " [makeValAndOff(int32(int8(c)),off)] "
+                               }
+                               memOpData.ArgsLoadAddr = immArg + origArgs + fmt.Sprintf("l:(VMOVDQUload%d {sym} [off] ptr mem)", *lastVreg.Bits)
                                // Remove the last vreg from the arg and change it to "ptr".
-                               memOpData.ArgsAddr = "{sym} [off] " + data.Args[:len(data.Args)-1] + "ptr"
+                               memOpData.ArgsAddr = "{sym}" + immArgCombineOff + origArgs + "ptr"
                                if maskType == OneMask {
                                        memOpData.ArgsAddr += " mask"
                                        memOpData.ArgsLoadAddr += " mask"