This CL adds VPALIGNR as ConcatShiftBytesRight[Grouped].
Change-Id: I46c6703085efb0613deefa512de9911b4fdf6bc4
Reviewed-on: https://go-review.googlesource.com/c/go/+/714440
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
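
For context, the API surface this CL wires up can be exercised as in the sketch below (the import path of the experimental simd package is assumed; passing x as both operands makes the operation a plain byte rotation, which sidesteps the question of operand order):

	package vecdemo

	import "simd" // assumed import path of the experimental simd package

	// rotateRight4 rotates the 16 bytes of x right by 4 byte positions:
	// concatenating x with itself and shifting right by 4 bytes brings
	// x's own low bytes back in at the top.
	func rotateRight4(x simd.Uint8x16) simd.Uint8x16 {
		return x.ConcatShiftBytesRight(4, x)
	}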
ssa.OpAMD64VPSRAQMasked512const:
p = simdVkvImm8(s, v)
- case ssa.OpAMD64VCMPPS128,
+ case ssa.OpAMD64VPALIGNR128,
+ ssa.OpAMD64VPALIGNR256,
+ ssa.OpAMD64VPALIGNR512,
+ ssa.OpAMD64VCMPPS128,
ssa.OpAMD64VCMPPS256,
ssa.OpAMD64VCMPPD128,
ssa.OpAMD64VCMPPD256,
ssa.OpAMD64VPAVGWMasked128Merging,
ssa.OpAMD64VPAVGWMasked256Merging,
ssa.OpAMD64VPAVGWMasked512Merging,
+ ssa.OpAMD64VPALIGNRMasked128Merging,
+ ssa.OpAMD64VPALIGNRMasked256Merging,
+ ssa.OpAMD64VPALIGNRMasked512Merging,
ssa.OpAMD64VPACKSSDWMasked128Merging,
ssa.OpAMD64VPACKSSDWMasked256Merging,
ssa.OpAMD64VPACKSSDWMasked512Merging,
ssa.OpAMD64VPEXTRW128:
p = simdVgpImm8(s, v)
- case ssa.OpAMD64VGF2P8AFFINEINVQBMasked128,
+ case ssa.OpAMD64VPALIGNRMasked128,
+ ssa.OpAMD64VPALIGNRMasked256,
+ ssa.OpAMD64VPALIGNRMasked512,
+ ssa.OpAMD64VGF2P8AFFINEINVQBMasked128,
ssa.OpAMD64VGF2P8AFFINEINVQBMasked256,
ssa.OpAMD64VGF2P8AFFINEINVQBMasked512,
ssa.OpAMD64VGF2P8AFFINEQBMasked128,
ssa.OpAMD64VPCOMPRESSQMasked128,
ssa.OpAMD64VPCOMPRESSQMasked256,
ssa.OpAMD64VPCOMPRESSQMasked512,
ssa.OpAMD64VPMOVWBMasked128_128,
ssa.OpAMD64VPMOVWBMasked128_256,
ssa.OpAMD64VPMOVWBMasked256,
(CompressUint64x2 x mask) => (VPCOMPRESSQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
(CompressUint64x4 x mask) => (VPCOMPRESSQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
(CompressUint64x8 x mask) => (VPCOMPRESSQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(ConcatShiftBytesRightUint8x16 ...) => (VPALIGNR128 ...)
+(ConcatShiftBytesRightGroupedUint8x32 ...) => (VPALIGNR256 ...)
+(ConcatShiftBytesRightGroupedUint8x64 ...) => (VPALIGNR512 ...)
(ConvertToInt8Int16x8 ...) => (VPMOVWB128_128 ...)
(ConvertToInt8Int16x16 ...) => (VPMOVWB128_256 ...)
(ConvertToInt8Int16x32 ...) => (VPMOVWB256 ...)
(VMOVDQU64Masked128 (VREDUCEPD128 [a] x) mask) => (VREDUCEPDMasked128 [a] x mask)
(VMOVDQU64Masked256 (VREDUCEPD256 [a] x) mask) => (VREDUCEPDMasked256 [a] x mask)
(VMOVDQU64Masked512 (VREDUCEPD512 [a] x) mask) => (VREDUCEPDMasked512 [a] x mask)
+(VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask) => (VPALIGNRMasked128 [a] x y mask)
+(VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask) => (VPALIGNRMasked256 [a] x y mask)
+(VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask) => (VPALIGNRMasked512 [a] x y mask)
(VMOVDQU16Masked128 (VPMOVWB128_128 x) mask) => (VPMOVWBMasked128_128 x mask)
(VMOVDQU16Masked256 (VPMOVWB128_256 x) mask) => (VPMOVWBMasked128_256 x mask)
(VMOVDQU16Masked256 (VPMOVWB256 x) mask) => (VPMOVWBMasked256 x mask)
(VPBLENDMBMasked512 dst (VPADDB512 x y) mask) => (VPADDBMasked512Merging dst x y mask)
(VPBLENDMBMasked512 dst (VPADDSB512 x y) mask) => (VPADDSBMasked512Merging dst x y mask)
(VPBLENDMBMasked512 dst (VPADDUSB512 x y) mask) => (VPADDUSBMasked512Merging dst x y mask)
+(VPBLENDMBMasked512 dst (VPALIGNR512 [a] x y) mask) => (VPALIGNRMasked512Merging dst [a] x y mask)
(VPBLENDMBMasked512 dst (VPAVGB512 x y) mask) => (VPAVGBMasked512Merging dst x y mask)
(VPBLENDMBMasked512 dst (VPMAXSB512 x y) mask) => (VPMAXSBMasked512Merging dst x y mask)
(VPBLENDMBMasked512 dst (VPMAXUB512 x y) mask) => (VPMAXUBMasked512Merging dst x y mask)
(VPBLENDVB128 dst (VPADDUSB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDUSBMasked128Merging dst x y (VPMOVVec8x16ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPADDUSW128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDUSWMasked128Merging dst x y (VPMOVVec16x8ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPADDW128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDWMasked128Merging dst x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(VPBLENDVB128 dst (VPALIGNR128 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPALIGNRMasked128Merging dst [a] x y (VPMOVVec8x16ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPAVGB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPAVGBMasked128Merging dst x y (VPMOVVec8x16ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPAVGW128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPAVGWMasked128Merging dst x y (VPMOVVec16x8ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPBROADCASTB128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPBROADCASTBMasked128Merging dst x (VPMOVVec8x16ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPADDUSB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDUSBMasked256Merging dst x y (VPMOVVec8x32ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPADDUSW256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDUSWMasked256Merging dst x y (VPMOVVec16x16ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPADDW256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDWMasked256Merging dst x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+(VPBLENDVB256 dst (VPALIGNR256 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPALIGNRMasked256Merging dst [a] x y (VPMOVVec8x32ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPAVGB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPAVGBMasked256Merging dst x y (VPMOVVec8x32ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPAVGW256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPAVGWMasked256Merging dst x y (VPMOVVec16x16ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPLZCNTD256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPLZCNTDMasked256Merging dst x (VPMOVVec32x8ToM <types.TypeMask> mask))
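
The VPBLENDVB rules above fuse "compute VPALIGNR, then blend with dst under a byte mask" into a single merging-masked VPALIGNR when AVX-512 is available. As a point of reference, here is a scalar sketch of the merging semantics on one 16-byte group; the operand order (y supplying the low half) is an assumption drawn from the VPALIGNR encoding, not something this diff states:

	// mergingConcatShiftRight16 models a merging-masked VPALIGNR on a
	// single 16-byte group: selected bytes receive the shifted result,
	// unselected bytes keep the destination's value.
	func mergingConcatShiftRight16(dst, x, y [16]byte, shift uint8, mask [16]bool) (out [16]byte) {
		var concat [32]byte
		copy(concat[:16], y[:]) // y as the low half (assumed operand order)
		copy(concat[16:], x[:]) // x as the high half (assumed operand order)
		for i := 0; i < 16; i++ {
			switch {
			case !mask[i]:
				out[i] = dst[i] // merging: keep the destination byte
			case int(shift)+i < 32:
				out[i] = concat[int(shift)+i]
			default:
				out[i] = 0 // bytes shifted past the concatenation are zero
			}
		}
		return
	}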
{name: "VINSERTF128256", argLength: 2, reg: v21, asm: "VINSERTF128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VINSERTI64X4512", argLength: 2, reg: w21, asm: "VINSERTI64X4", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VINSERTI128256", argLength: 2, reg: v21, asm: "VINSERTI128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+ {name: "VPALIGNR128", argLength: 2, reg: v21, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VPALIGNR256", argLength: 2, reg: v21, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+ {name: "VPALIGNR512", argLength: 2, reg: w21, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
+ {name: "VPALIGNRMasked128", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VPALIGNRMasked256", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+ {name: "VPALIGNRMasked512", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPB512", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPBMasked128", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "UInt8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPBMasked256", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "UInt8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VSUBPSMasked128Merging", argLength: 4, reg: w3kw, asm: "VSUBPS", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VSUBPSMasked256Merging", argLength: 4, reg: w3kw, asm: "VSUBPS", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VSUBPSMasked512Merging", argLength: 4, reg: w3kw, asm: "VSUBPS", commutative: false, typ: "Vec512", resultInArg0: true},
+ {name: "VPALIGNRMasked128Merging", argLength: 4, reg: w3kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
+ {name: "VPALIGNRMasked256Merging", argLength: 4, reg: w3kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
+ {name: "VPALIGNRMasked512Merging", argLength: 4, reg: w3kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPROLDMasked128Merging", argLength: 3, reg: w2kw, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPROLDMasked256Merging", argLength: 3, reg: w2kw, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPROLDMasked512Merging", argLength: 3, reg: w2kw, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "CeilScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "UInt8"},
{name: "CeilScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "CeilScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "ConcatShiftBytesRightGroupedUint8x32", argLength: 2, commutative: false, aux: "UInt8"},
+ {name: "ConcatShiftBytesRightGroupedUint8x64", argLength: 2, commutative: false, aux: "UInt8"},
+ {name: "ConcatShiftBytesRightUint8x16", argLength: 2, commutative: false, aux: "UInt8"},
{name: "FloorScaledFloat32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "FloorScaledFloat32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "FloorScaledFloat32x16", argLength: 1, commutative: false, aux: "UInt8"},
OpAMD64VINSERTF128256
OpAMD64VINSERTI64X4512
OpAMD64VINSERTI128256
+ OpAMD64VPALIGNR128
+ OpAMD64VPALIGNR256
+ OpAMD64VPALIGNR512
+ OpAMD64VPALIGNRMasked128
+ OpAMD64VPALIGNRMasked256
+ OpAMD64VPALIGNRMasked512
OpAMD64VPCMPB512
OpAMD64VPCMPBMasked128
OpAMD64VPCMPBMasked256
OpAMD64VSUBPSMasked128Merging
OpAMD64VSUBPSMasked256Merging
OpAMD64VSUBPSMasked512Merging
+ OpAMD64VPALIGNRMasked128Merging
+ OpAMD64VPALIGNRMasked256Merging
+ OpAMD64VPALIGNRMasked512Merging
OpAMD64VPROLDMasked128Merging
OpAMD64VPROLDMasked256Merging
OpAMD64VPROLDMasked512Merging
OpCeilScaledResidueFloat64x2
OpCeilScaledResidueFloat64x4
OpCeilScaledResidueFloat64x8
+ OpConcatShiftBytesRightGroupedUint8x32
+ OpConcatShiftBytesRightGroupedUint8x64
+ OpConcatShiftBytesRightUint8x16
OpFloorScaledFloat32x4
OpFloorScaledFloat32x8
OpFloorScaledFloat32x16
},
},
},
+ {
+ name: "VPALIGNR128",
+ auxType: auxUInt8,
+ argLen: 2,
+ asm: x86.AVPALIGNR,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+ },
+ outputs: []outputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ },
+ },
+ {
+ name: "VPALIGNR256",
+ auxType: auxUInt8,
+ argLen: 2,
+ asm: x86.AVPALIGNR,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+ },
+ outputs: []outputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ },
+ },
+ {
+ name: "VPALIGNR512",
+ auxType: auxUInt8,
+ argLen: 2,
+ asm: x86.AVPALIGNR,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPALIGNRMasked128",
+ auxType: auxUInt8,
+ argLen: 3,
+ asm: x86.AVPALIGNR,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPALIGNRMasked256",
+ auxType: auxUInt8,
+ argLen: 3,
+ asm: x86.AVPALIGNR,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPALIGNRMasked512",
+ auxType: auxUInt8,
+ argLen: 3,
+ asm: x86.AVPALIGNR,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
{
name: "VPCMPB512",
auxType: auxUInt8,
},
},
},
+ {
+ name: "VPALIGNRMasked128Merging",
+ auxType: auxUInt8,
+ argLen: 4,
+ resultInArg0: true,
+ asm: x86.AVPALIGNR,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPALIGNRMasked256Merging",
+ auxType: auxUInt8,
+ argLen: 4,
+ resultInArg0: true,
+ asm: x86.AVPALIGNR,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPALIGNRMasked512Merging",
+ auxType: auxUInt8,
+ argLen: 4,
+ resultInArg0: true,
+ asm: x86.AVPALIGNR,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
{
name: "VPROLDMasked128Merging",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
+ {
+ name: "ConcatShiftBytesRightGroupedUint8x32",
+ auxType: auxUInt8,
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatShiftBytesRightGroupedUint8x64",
+ auxType: auxUInt8,
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "ConcatShiftBytesRightUint8x16",
+ auxType: auxUInt8,
+ argLen: 2,
+ generic: true,
+ },
{
name: "FloorScaledFloat32x4",
auxType: auxUInt8,
return rewriteValueAMD64_OpCompressUint8x32(v)
case OpCompressUint8x64:
return rewriteValueAMD64_OpCompressUint8x64(v)
+ case OpConcatShiftBytesRightGroupedUint8x32:
+ v.Op = OpAMD64VPALIGNR256
+ return true
+ case OpConcatShiftBytesRightGroupedUint8x64:
+ v.Op = OpAMD64VPALIGNR512
+ return true
+ case OpConcatShiftBytesRightUint8x16:
+ v.Op = OpAMD64VPALIGNR128
+ return true
case OpCondSelect:
return rewriteValueAMD64_OpCondSelect(v)
case OpConst16:
v.AddArg2(x, mask)
return true
}
+ // match: (VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask)
+ // result: (VPALIGNRMasked128 [a] x y mask)
+ for {
+ if v_0.Op != OpAMD64VPALIGNR128 {
+ break
+ }
+ a := auxIntToUint8(v_0.AuxInt)
+ y := v_0.Args[1]
+ x := v_0.Args[0]
+ mask := v_1
+ v.reset(OpAMD64VPALIGNRMasked128)
+ v.AuxInt = uint8ToAuxInt(a)
+ v.AddArg3(x, y, mask)
+ return true
+ }
// match: (VMOVDQU8Masked128 (VPMOVSXBW128 x) mask)
// result: (VPMOVSXBWMasked128 x mask)
for {
v.AddArg2(x, mask)
return true
}
+ // match: (VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask)
+ // result: (VPALIGNRMasked256 [a] x y mask)
+ for {
+ if v_0.Op != OpAMD64VPALIGNR256 {
+ break
+ }
+ a := auxIntToUint8(v_0.AuxInt)
+ y := v_0.Args[1]
+ x := v_0.Args[0]
+ mask := v_1
+ v.reset(OpAMD64VPALIGNRMasked256)
+ v.AuxInt = uint8ToAuxInt(a)
+ v.AddArg3(x, y, mask)
+ return true
+ }
// match: (VMOVDQU8Masked256 (VPMOVSXBW256 x) mask)
// result: (VPMOVSXBWMasked256 x mask)
for {
v.AddArg2(x, mask)
return true
}
+ // match: (VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask)
+ // result: (VPALIGNRMasked512 [a] x y mask)
+ for {
+ if v_0.Op != OpAMD64VPALIGNR512 {
+ break
+ }
+ a := auxIntToUint8(v_0.AuxInt)
+ y := v_0.Args[1]
+ x := v_0.Args[0]
+ mask := v_1
+ v.reset(OpAMD64VPALIGNRMasked512)
+ v.AuxInt = uint8ToAuxInt(a)
+ v.AddArg3(x, y, mask)
+ return true
+ }
// match: (VMOVDQU8Masked512 (VPMOVSXBW512 x) mask)
// result: (VPMOVSXBWMasked512 x mask)
for {
v.AddArg4(dst, x, y, mask)
return true
}
+ // match: (VPBLENDMBMasked512 dst (VPALIGNR512 [a] x y) mask)
+ // result: (VPALIGNRMasked512Merging dst [a] x y mask)
+ for {
+ dst := v_0
+ if v_1.Op != OpAMD64VPALIGNR512 {
+ break
+ }
+ a := auxIntToUint8(v_1.AuxInt)
+ y := v_1.Args[1]
+ x := v_1.Args[0]
+ mask := v_2
+ v.reset(OpAMD64VPALIGNRMasked512Merging)
+ v.AuxInt = uint8ToAuxInt(a)
+ v.AddArg4(dst, x, y, mask)
+ return true
+ }
// match: (VPBLENDMBMasked512 dst (VPAVGB512 x y) mask)
// result: (VPAVGBMasked512Merging dst x y mask)
for {
v.AddArg4(dst, x, y, v0)
return true
}
+ // match: (VPBLENDVB128 dst (VPALIGNR128 [a] x y) mask)
+ // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
+ // result: (VPALIGNRMasked128Merging dst [a] x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+ for {
+ dst := v_0
+ if v_1.Op != OpAMD64VPALIGNR128 {
+ break
+ }
+ a := auxIntToUint8(v_1.AuxInt)
+ y := v_1.Args[1]
+ x := v_1.Args[0]
+ mask := v_2
+ if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
+ break
+ }
+ v.reset(OpAMD64VPALIGNRMasked128Merging)
+ v.AuxInt = uint8ToAuxInt(a)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg4(dst, x, y, v0)
+ return true
+ }
// match: (VPBLENDVB128 dst (VPAVGB128 x y) mask)
// cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
// result: (VPAVGBMasked128Merging dst x y (VPMOVVec8x16ToM <types.TypeMask> mask))
v.AddArg4(dst, x, y, v0)
return true
}
+ // match: (VPBLENDVB256 dst (VPALIGNR256 [a] x y) mask)
+ // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
+ // result: (VPALIGNRMasked256Merging dst [a] x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+ for {
+ dst := v_0
+ if v_1.Op != OpAMD64VPALIGNR256 {
+ break
+ }
+ a := auxIntToUint8(v_1.AuxInt)
+ y := v_1.Args[1]
+ x := v_1.Args[0]
+ mask := v_2
+ if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
+ break
+ }
+ v.reset(OpAMD64VPALIGNRMasked256Merging)
+ v.AuxInt = uint8ToAuxInt(a)
+ v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+ v0.AddArg(mask)
+ v.AddArg4(dst, x, y, v0)
+ return true
+ }
// match: (VPBLENDVB256 dst (VPAVGB256 x y) mask)
// cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
// result: (VPAVGBMasked256Merging dst x y (VPMOVVec8x32ToM <types.TypeMask> mask))
addF(simdPackage, "Uint64x2.Compress", opLen2(ssa.OpCompressUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Compress", opLen2(ssa.OpCompressUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Compress", opLen2(ssa.OpCompressUint64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.ConvertToInt8", opLen1(ssa.OpConvertToInt8Int16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.ConvertToInt8", opLen1(ssa.OpConvertToInt8Int16x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x32.ConvertToInt8", opLen1(ssa.OpConvertToInt8Int16x32, types.TypeVec256), sys.AMD64)
documentation: !string |-
// NAME selects the low and high 128-bit halves from the 128-bit halves
// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
+
+- go: ConcatShiftBytesRight
+ commutative: false
+ documentation: !string |-
+ // NAME concatenates x and y and shifts the result right by constant bytes.
+ // The result vector is the low half of the shifted concatenation.
+
+- go: ConcatShiftBytesRightGrouped
+ commutative: false
+ documentation: !string |-
+ // NAME concatenates x and y and shifts the result right by constant bytes.
+ // The result vector is the low half of the shifted concatenation.
+ // This operation is performed independently within each 16-byte group.
inVariant: []
out:
- *v
+
+- go: ConcatShiftBytesRight
+ asm: VPALIGNR
+ in:
+ - &uint128
+ go: $t
+ base: uint
+ bits: 128
+ - *uint128
+ - class: immediate
+ immOffset: 0
+ out:
+ - *uint128
+
+- go: ConcatShiftBytesRightGrouped
+ asm: VPALIGNR
+ in:
+ - &uint256512
+ go: $t
+ base: uint
+ bits: 256|512
+ - *uint256512
+ - class: immediate
+ immOffset: 0
+ out:
+ - *uint256512
+
\ No newline at end of file
// Asm: VPCOMPRESSQ, CPU Feature: AVX512
func (x Uint64x8) Compress(mask Mask64x8) Uint64x8
+/* ConcatShiftBytesRight */
+
+// ConcatShiftBytesRight concatenates x and y and shifts the result right by constant bytes.
+// The result vector is the low half of the shifted concatenation.
+//
+// constant results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPALIGNR, CPU Feature: AVX
+func (x Uint8x16) ConcatShiftBytesRight(constant uint8, y Uint8x16) Uint8x16
+
+/* ConcatShiftBytesRightGrouped */
+
+// ConcatShiftBytesRightGrouped concatenates x and y and shifts the result right by constant bytes.
+// The result vector is the low half of the shifted concatenation.
+// This operation is performed independently within each 16-byte group.
+//
+// constant results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPALIGNR, CPU Feature: AVX2
+func (x Uint8x32) ConcatShiftBytesRightGrouped(constant uint8, y Uint8x32) Uint8x32
+
+// ConcatShiftBytesRightGrouped concatenates x and y and shifts the result right by constant bytes.
+// The result vector is the low half of the shifted concatenation.
+// This operation is performed independently within each 16-byte group.
+//
+// constant results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPALIGNR, CPU Feature: AVX512
+func (x Uint8x64) ConcatShiftBytesRightGrouped(constant uint8, y Uint8x64) Uint8x64
+
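To pin the documented behavior down, a minimal scalar model of the unmasked 16-byte operation follows; as above, the operand order (y low, x high) is an assumption from the VPALIGNR encoding rather than something these stubs state:

	// concatShiftBytesRightRef mirrors what x.ConcatShiftBytesRight(shift, y)
	// computes on one 16-byte group: the 32-byte concatenation is shifted
	// right by shift bytes and the low 16 bytes are returned; positions
	// past the concatenation fill with zeros.
	func concatShiftBytesRightRef(x, y [16]byte, shift uint8) (out [16]byte) {
		var concat [32]byte
		copy(concat[:16], y[:]) // low half (assumed operand order)
		copy(concat[16:], x[:]) // high half (assumed operand order)
		for i := 0; i < 16; i++ {
			if int(shift)+i < 32 {
				out[i] = concat[int(shift)+i]
			}
		}
		return
	}
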
/* ConvertToInt8 */
// ConvertToInt8 converts element values to int8.