From: Junyang Shao
Date: Thu, 23 Oct 2025 20:55:57 +0000 (+0000)
Subject: [dev.simd] cmd/compile, simd: add VPALIGNR
X-Git-Tag: go1.26rc1~147^2~22
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=95871e4a00;p=gostls13.git

[dev.simd] cmd/compile, simd: add VPALIGNR

This CL names VPALIGNR ConcatShiftBytesRight[Grouped].

Change-Id: I46c6703085efb0613deefa512de9911b4fdf6bc4
Reviewed-on: https://go-review.googlesource.com/c/go/+/714440
Reviewed-by: David Chase
LUCI-TryBot-Result: Go LUCI
---

diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index 9425b42d41..e2d6f6321b 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -1113,7 +1113,10 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
         ssa.OpAMD64VPSRAQMasked512const:
         p = simdVkvImm8(s, v)
 
-    case ssa.OpAMD64VCMPPS128,
+    case ssa.OpAMD64VPALIGNR128,
+        ssa.OpAMD64VPALIGNR256,
+        ssa.OpAMD64VPALIGNR512,
+        ssa.OpAMD64VCMPPS128,
         ssa.OpAMD64VCMPPS256,
         ssa.OpAMD64VCMPPD128,
         ssa.OpAMD64VCMPPD256,
@@ -1315,6 +1318,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
         ssa.OpAMD64VPAVGWMasked128Merging,
         ssa.OpAMD64VPAVGWMasked256Merging,
         ssa.OpAMD64VPAVGWMasked512Merging,
+        ssa.OpAMD64VPALIGNRMasked256Merging,
+        ssa.OpAMD64VPALIGNRMasked512Merging,
+        ssa.OpAMD64VPALIGNRMasked128Merging,
         ssa.OpAMD64VPACKSSDWMasked128Merging,
         ssa.OpAMD64VPACKSSDWMasked256Merging,
         ssa.OpAMD64VPACKSSDWMasked512Merging,
@@ -1651,7 +1657,10 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
         ssa.OpAMD64VPEXTRW128:
         p = simdVgpImm8(s, v)
 
-    case ssa.OpAMD64VGF2P8AFFINEINVQBMasked128,
+    case ssa.OpAMD64VPALIGNRMasked256,
+        ssa.OpAMD64VPALIGNRMasked512,
+        ssa.OpAMD64VPALIGNRMasked128,
+        ssa.OpAMD64VGF2P8AFFINEINVQBMasked128,
         ssa.OpAMD64VGF2P8AFFINEINVQBMasked256,
         ssa.OpAMD64VGF2P8AFFINEINVQBMasked512,
         ssa.OpAMD64VGF2P8AFFINEQBMasked128,
@@ -2673,6 +2682,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
         ssa.OpAMD64VPCOMPRESSQMasked128,
         ssa.OpAMD64VPCOMPRESSQMasked256,
         ssa.OpAMD64VPCOMPRESSQMasked512,
+        ssa.OpAMD64VPALIGNRMasked256,
+        ssa.OpAMD64VPALIGNRMasked512,
+        ssa.OpAMD64VPALIGNRMasked128,
         ssa.OpAMD64VPMOVWBMasked128_128,
         ssa.OpAMD64VPMOVWBMasked128_256,
         ssa.OpAMD64VPMOVWBMasked256,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 7ba970ca42..4723546b12 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -218,6 +218,9 @@
 (CompressUint64x2 x mask) => (VPCOMPRESSQMasked128 x (VPMOVVec64x2ToM mask))
 (CompressUint64x4 x mask) => (VPCOMPRESSQMasked256 x (VPMOVVec64x4ToM mask))
 (CompressUint64x8 x mask) => (VPCOMPRESSQMasked512 x (VPMOVVec64x8ToM mask))
+(ConcatShiftBytesRightUint8x16 ...) => (VPALIGNR128 ...)
+(ConcatShiftBytesRightGroupedUint8x32 ...) => (VPALIGNR256 ...)
+(ConcatShiftBytesRightGroupedUint8x64 ...) => (VPALIGNR512 ...)
 (ConvertToInt8Int16x8 ...) => (VPMOVWB128_128 ...)
 (ConvertToInt8Int16x16 ...) => (VPMOVWB128_256 ...)
 (ConvertToInt8Int16x32 ...) => (VPMOVWB256 ...)
@@ -1423,6 +1426,9 @@
 (VMOVDQU64Masked128 (VREDUCEPD128 [a] x) mask) => (VREDUCEPDMasked128 [a] x mask)
 (VMOVDQU64Masked256 (VREDUCEPD256 [a] x) mask) => (VREDUCEPDMasked256 [a] x mask)
 (VMOVDQU64Masked512 (VREDUCEPD512 [a] x) mask) => (VREDUCEPDMasked512 [a] x mask)
+(VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask) => (VPALIGNRMasked256 [a] x y mask)
+(VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask) => (VPALIGNRMasked512 [a] x y mask)
+(VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask) => (VPALIGNRMasked128 [a] x y mask)
 (VMOVDQU16Masked128 (VPMOVWB128_128 x) mask) => (VPMOVWBMasked128_128 x mask)
 (VMOVDQU16Masked256 (VPMOVWB128_256 x) mask) => (VPMOVWBMasked128_256 x mask)
 (VMOVDQU16Masked256 (VPMOVWB256 x) mask) => (VPMOVWBMasked256 x mask)
@@ -1894,6 +1900,7 @@
 (VPBLENDMBMasked512 dst (VPADDB512 x y) mask) => (VPADDBMasked512Merging dst x y mask)
 (VPBLENDMBMasked512 dst (VPADDSB512 x y) mask) => (VPADDSBMasked512Merging dst x y mask)
 (VPBLENDMBMasked512 dst (VPADDUSB512 x y) mask) => (VPADDUSBMasked512Merging dst x y mask)
+(VPBLENDMBMasked512 dst (VPALIGNR512 [a] x y) mask) => (VPALIGNRMasked512Merging dst [a] x y mask)
 (VPBLENDMBMasked512 dst (VPAVGB512 x y) mask) => (VPAVGBMasked512Merging dst x y mask)
 (VPBLENDMBMasked512 dst (VPMAXSB512 x y) mask) => (VPMAXSBMasked512Merging dst x y mask)
 (VPBLENDMBMasked512 dst (VPMAXUB512 x y) mask) => (VPMAXUBMasked512Merging dst x y mask)
@@ -2057,6 +2064,7 @@
 (VPBLENDVB128 dst (VPADDUSB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDUSBMasked128Merging dst x y (VPMOVVec8x16ToM mask))
 (VPBLENDVB128 dst (VPADDUSW128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDUSWMasked128Merging dst x y (VPMOVVec16x8ToM mask))
 (VPBLENDVB128 dst (VPADDW128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDWMasked128Merging dst x y (VPMOVVec16x8ToM mask))
+(VPBLENDVB128 dst (VPALIGNR128 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPALIGNRMasked128Merging dst [a] x y (VPMOVVec8x16ToM mask))
 (VPBLENDVB128 dst (VPAVGB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPAVGBMasked128Merging dst x y (VPMOVVec8x16ToM mask))
 (VPBLENDVB128 dst (VPAVGW128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPAVGWMasked128Merging dst x y (VPMOVVec16x8ToM mask))
 (VPBLENDVB128 dst (VPBROADCASTB128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPBROADCASTBMasked128Merging dst x (VPMOVVec8x16ToM mask))
@@ -2227,6 +2235,7 @@
 (VPBLENDVB256 dst (VPADDUSB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDUSBMasked256Merging dst x y (VPMOVVec8x32ToM mask))
 (VPBLENDVB256 dst (VPADDUSW256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDUSWMasked256Merging dst x y (VPMOVVec16x16ToM mask))
 (VPBLENDVB256 dst (VPADDW256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPADDWMasked256Merging dst x y (VPMOVVec16x16ToM mask))
+(VPBLENDVB256 dst (VPALIGNR256 [a] x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPALIGNRMasked256Merging dst [a] x y (VPMOVVec8x32ToM mask))
 (VPBLENDVB256 dst (VPAVGB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPAVGBMasked256Merging dst x y (VPMOVVec8x32ToM mask))
 (VPBLENDVB256 dst (VPAVGW256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPAVGWMasked256Merging dst x y (VPMOVVec16x16ToM mask))
 (VPBLENDVB256 dst (VPLZCNTD256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPLZCNTDMasked256Merging dst x (VPMOVVec32x8ToM mask))
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
index 4e4f4a4205..4f722f8a11 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@@ -1186,6 +1186,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
         {name: "VINSERTF128256", argLength: 2, reg: v21, asm: "VINSERTF128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
         {name: "VINSERTI64X4512", argLength: 2, reg: w21, asm: "VINSERTI64X4", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
         {name: "VINSERTI128256", argLength: 2, reg: v21, asm: "VINSERTI128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+        {name: "VPALIGNR128", argLength: 2, reg: v21, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+        {name: "VPALIGNR256", argLength: 2, reg: v21, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+        {name: "VPALIGNR512", argLength: 2, reg: w21, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
+        {name: "VPALIGNRMasked128", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+        {name: "VPALIGNRMasked256", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+        {name: "VPALIGNRMasked512", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
         {name: "VPCMPB512", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
         {name: "VPCMPBMasked128", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "UInt8", commutative: true, typ: "Mask", resultInArg0: false},
         {name: "VPCMPBMasked256", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "UInt8", commutative: true, typ: "Mask", resultInArg0: false},
@@ -2343,6 +2349,9 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
         {name: "VSUBPSMasked128Merging", argLength: 4, reg: w3kw, asm: "VSUBPS", commutative: false, typ: "Vec128", resultInArg0: true},
         {name: "VSUBPSMasked256Merging", argLength: 4, reg: w3kw, asm: "VSUBPS", commutative: false, typ: "Vec256", resultInArg0: true},
         {name: "VSUBPSMasked512Merging", argLength: 4, reg: w3kw, asm: "VSUBPS", commutative: false, typ: "Vec512", resultInArg0: true},
+        {name: "VPALIGNRMasked128Merging", argLength: 4, reg: w3kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
+        {name: "VPALIGNRMasked256Merging", argLength: 4, reg: w3kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
+        {name: "VPALIGNRMasked512Merging", argLength: 4, reg: w3kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
         {name: "VPROLDMasked128Merging", argLength: 3, reg: w2kw, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
         {name: "VPROLDMasked256Merging", argLength: 3, reg: w2kw, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
         {name: "VPROLDMasked512Merging", argLength: 3, reg: w2kw, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index 71a4cb3ea8..15608e4fa6 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -1122,6 +1122,9 @@ func simdGenericOps() []opData {
         {name: "CeilScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "UInt8"},
         {name: "CeilScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "UInt8"},
         {name: "CeilScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "UInt8"},
+        {name: "ConcatShiftBytesRightGroupedUint8x32", argLength: 2, commutative: false, aux: "UInt8"},
+        {name: "ConcatShiftBytesRightGroupedUint8x64", argLength: 2, commutative: false, aux: "UInt8"},
+        {name: "ConcatShiftBytesRightUint8x16", argLength: 2, commutative: false, aux: "UInt8"},
         {name: "FloorScaledFloat32x4", argLength: 1, commutative: false, aux: "UInt8"},
         {name: "FloorScaledFloat32x8", argLength: 1, commutative: false, aux: "UInt8"},
         {name: "FloorScaledFloat32x16", argLength: 1, commutative: false, aux: "UInt8"},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 1d3875a9be..6bbc29dd12 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -2427,6 +2427,12 @@ const (
     OpAMD64VINSERTF128256
     OpAMD64VINSERTI64X4512
     OpAMD64VINSERTI128256
+    OpAMD64VPALIGNR128
+    OpAMD64VPALIGNR256
+    OpAMD64VPALIGNR512
+    OpAMD64VPALIGNRMasked128
+    OpAMD64VPALIGNRMasked256
+    OpAMD64VPALIGNRMasked512
     OpAMD64VPCMPB512
     OpAMD64VPCMPBMasked128
     OpAMD64VPCMPBMasked256
@@ -3584,6 +3590,9 @@ const (
     OpAMD64VSUBPSMasked128Merging
     OpAMD64VSUBPSMasked256Merging
     OpAMD64VSUBPSMasked512Merging
+    OpAMD64VPALIGNRMasked128Merging
+    OpAMD64VPALIGNRMasked256Merging
+    OpAMD64VPALIGNRMasked512Merging
     OpAMD64VPROLDMasked128Merging
     OpAMD64VPROLDMasked256Merging
     OpAMD64VPROLDMasked512Merging
@@ -7057,6 +7066,9 @@ const (
     OpCeilScaledResidueFloat64x2
     OpCeilScaledResidueFloat64x4
     OpCeilScaledResidueFloat64x8
+    OpConcatShiftBytesRightGroupedUint8x32
+    OpConcatShiftBytesRightGroupedUint8x64
+    OpConcatShiftBytesRightUint8x16
     OpFloorScaledFloat32x4
     OpFloorScaledFloat32x8
     OpFloorScaledFloat32x16
@@ -37828,6 +37840,99 @@ var opcodeTable = [...]opInfo{
             },
         },
     },
+    {
+        name:    "VPALIGNR128",
+        auxType: auxUInt8,
+        argLen:  2,
+        asm:     x86.AVPALIGNR,
+        reg: regInfo{
+            inputs: []inputInfo{
+                {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+            },
+            outputs: []outputInfo{
+                {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+            },
+        },
+    },
+    {
+        name:    "VPALIGNR256",
+        auxType: auxUInt8,
+        argLen:  2,
+        asm:     x86.AVPALIGNR,
+        reg: regInfo{
+            inputs: []inputInfo{
+                {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+            },
+            outputs: []outputInfo{
+                {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+            },
+        },
+    },
+    {
+        name:    "VPALIGNR512",
+        auxType: auxUInt8,
+        argLen:  2,
+        asm:     x86.AVPALIGNR,
+        reg: regInfo{
+            inputs: []inputInfo{
+                {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+            },
+            outputs: []outputInfo{
+                {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+            },
+        },
+    },
+    {
+        name:    "VPALIGNRMasked128",
"VPALIGNRMasked128", + auxType: auxUInt8, + argLen: 3, + asm: x86.AVPALIGNR, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPALIGNRMasked256", + auxType: auxUInt8, + argLen: 3, + asm: x86.AVPALIGNR, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPALIGNRMasked512", + auxType: auxUInt8, + argLen: 3, + asm: x86.AVPALIGNR, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPCMPB512", auxType: auxUInt8, @@ -56761,6 +56866,60 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPALIGNRMasked128Merging", + auxType: auxUInt8, + argLen: 4, + resultInArg0: true, + asm: x86.AVPALIGNR, + reg: regInfo{ + inputs: []inputInfo{ + {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPALIGNRMasked256Merging", + auxType: auxUInt8, + argLen: 4, + resultInArg0: true, + asm: x86.AVPALIGNR, + reg: regInfo{ + inputs: []inputInfo{ + {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 
+                {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+            },
+        },
+    },
+    {
+        name:         "VPALIGNRMasked512Merging",
+        auxType:      auxUInt8,
+        argLen:       4,
+        resultInArg0: true,
+        asm:          x86.AVPALIGNR,
+        reg: regInfo{
+            inputs: []inputInfo{
+                {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                {0, 281472829161472},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                {1, 281474976645120},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                {2, 281474976645120},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+            },
+            outputs: []outputInfo{
+                {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+            },
+        },
+    },
     {
         name:    "VPROLDMasked128Merging",
         auxType: auxUInt8,
@@ -91437,6 +91596,24 @@ var opcodeTable = [...]opInfo{
         argLen:  1,
         generic: true,
     },
+    {
+        name:    "ConcatShiftBytesRightGroupedUint8x32",
+        auxType: auxUInt8,
+        argLen:  2,
+        generic: true,
+    },
+    {
+        name:    "ConcatShiftBytesRightGroupedUint8x64",
+        auxType: auxUInt8,
+        argLen:  2,
+        generic: true,
+    },
+    {
+        name:    "ConcatShiftBytesRightUint8x16",
+        auxType: auxUInt8,
+        argLen:  2,
+        generic: true,
+    },
     {
         name:    "FloorScaledFloat32x4",
         auxType: auxUInt8,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 974af9d842..dff3333372 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -2548,6 +2548,15 @@ func rewriteValueAMD64(v *Value) bool {
         return rewriteValueAMD64_OpCompressUint8x32(v)
     case OpCompressUint8x64:
         return rewriteValueAMD64_OpCompressUint8x64(v)
+    case OpConcatShiftBytesRightGroupedUint8x32:
+        v.Op = OpAMD64VPALIGNR256
+        return true
+    case OpConcatShiftBytesRightGroupedUint8x64:
+        v.Op = OpAMD64VPALIGNR512
+        return true
+    case OpConcatShiftBytesRightUint8x16:
+        v.Op = OpAMD64VPALIGNR128
+        return true
     case OpCondSelect:
         return rewriteValueAMD64_OpCondSelect(v)
     case OpConst16:
@@ -37487,6 +37496,21 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked128(v *Value) bool {
         v.AddArg2(x, mask)
         return true
     }
+    // match: (VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask)
+    // result: (VPALIGNRMasked128 [a] x y mask)
+    for {
+        if v_0.Op != OpAMD64VPALIGNR128 {
+            break
+        }
+        a := auxIntToUint8(v_0.AuxInt)
+        y := v_0.Args[1]
+        x := v_0.Args[0]
+        mask := v_1
+        v.reset(OpAMD64VPALIGNRMasked128)
+        v.AuxInt = uint8ToAuxInt(a)
+        v.AddArg3(x, y, mask)
+        return true
+    }
     // match: (VMOVDQU8Masked128 (VPMOVSXBW128 x) mask)
     // result: (VPMOVSXBWMasked128 x mask)
     for {
@@ -37813,6 +37837,21 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked256(v *Value) bool {
         v.AddArg2(x, mask)
         return true
     }
+    // match: (VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask)
+    // result: (VPALIGNRMasked256 [a] x y mask)
+    for {
+        if v_0.Op != OpAMD64VPALIGNR256 {
+            break
+        }
+        a := auxIntToUint8(v_0.AuxInt)
+        y := v_0.Args[1]
+        x := v_0.Args[0]
+        mask := v_1
+        v.reset(OpAMD64VPALIGNRMasked256)
+        v.AuxInt = uint8ToAuxInt(a)
+        v.AddArg3(x, y, mask)
+        return true
+    }
     // match: (VMOVDQU8Masked256 (VPMOVSXBW256 x) mask)
     // result: (VPMOVSXBWMasked256 x mask)
     for {
@@ -38152,6 +38191,21 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v *Value) bool {
         v.AddArg2(x, mask)
         return true
     }
+    // match: (VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask)
+    // result: (VPALIGNRMasked512 [a] x y mask)
+    for {
+        if v_0.Op != OpAMD64VPALIGNR512 {
+            break
+        }
+        a := auxIntToUint8(v_0.AuxInt)
+        y := v_0.Args[1]
+        x := v_0.Args[0]
+        mask := v_1
+        v.reset(OpAMD64VPALIGNRMasked512)
+        v.AuxInt = uint8ToAuxInt(a)
+        v.AddArg3(x, y, mask)
+        return true
+    }
     // match: (VMOVDQU8Masked512 (VPMOVSXBW512 x) mask)
     // result: (VPMOVSXBWMasked512 x mask)
     for {
@@ -40658,6 +40712,22 @@ func rewriteValueAMD64_OpAMD64VPBLENDMBMasked512(v *Value) bool {
         v.AddArg4(dst, x, y, mask)
         return true
     }
+    // match: (VPBLENDMBMasked512 dst (VPALIGNR512 [a] x y) mask)
+    // result: (VPALIGNRMasked512Merging dst [a] x y mask)
+    for {
+        dst := v_0
+        if v_1.Op != OpAMD64VPALIGNR512 {
+            break
+        }
+        a := auxIntToUint8(v_1.AuxInt)
+        y := v_1.Args[1]
+        x := v_1.Args[0]
+        mask := v_2
+        v.reset(OpAMD64VPALIGNRMasked512Merging)
+        v.AuxInt = uint8ToAuxInt(a)
+        v.AddArg4(dst, x, y, mask)
+        return true
+    }
     // match: (VPBLENDMBMasked512 dst (VPAVGB512 x y) mask)
     // result: (VPAVGBMasked512Merging dst x y mask)
     for {
@@ -43185,6 +43255,28 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB128(v *Value) bool {
         v.AddArg4(dst, x, y, v0)
         return true
     }
+    // match: (VPBLENDVB128 dst (VPALIGNR128 [a] x y) mask)
+    // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
+    // result: (VPALIGNRMasked128Merging dst [a] x y (VPMOVVec8x16ToM mask))
+    for {
+        dst := v_0
+        if v_1.Op != OpAMD64VPALIGNR128 {
+            break
+        }
+        a := auxIntToUint8(v_1.AuxInt)
+        y := v_1.Args[1]
+        x := v_1.Args[0]
+        mask := v_2
+        if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
+            break
+        }
+        v.reset(OpAMD64VPALIGNRMasked128Merging)
+        v.AuxInt = uint8ToAuxInt(a)
+        v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
+        v0.AddArg(mask)
+        v.AddArg4(dst, x, y, v0)
+        return true
+    }
     // match: (VPBLENDVB128 dst (VPAVGB128 x y) mask)
     // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
     // result: (VPAVGBMasked128Merging dst x y (VPMOVVec8x16ToM mask))
@@ -46544,6 +46636,28 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB256(v *Value) bool {
         v.AddArg4(dst, x, y, v0)
         return true
     }
+    // match: (VPBLENDVB256 dst (VPALIGNR256 [a] x y) mask)
+    // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
+    // result: (VPALIGNRMasked256Merging dst [a] x y (VPMOVVec8x32ToM mask))
+    for {
+        dst := v_0
+        if v_1.Op != OpAMD64VPALIGNR256 {
+            break
+        }
+        a := auxIntToUint8(v_1.AuxInt)
+        y := v_1.Args[1]
+        x := v_1.Args[0]
+        mask := v_2
+        if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) {
+            break
+        }
+        v.reset(OpAMD64VPALIGNRMasked256Merging)
+        v.AuxInt = uint8ToAuxInt(a)
+        v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
+        v0.AddArg(mask)
+        v.AddArg4(dst, x, y, v0)
+        return true
+    }
     // match: (VPBLENDVB256 dst (VPAVGB256 x y) mask)
     // cond: v.Block.CPUfeatures.hasFeature(CPUavx512)
     // result: (VPAVGBMasked256Merging dst x y (VPMOVVec8x32ToM mask))
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go
index 710d375ad5..5c941321a4 100644
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -230,6 +230,9 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint64x2.Compress", opLen2(ssa.OpCompressUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.Compress", opLen2(ssa.OpCompressUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Compress", opLen2(ssa.OpCompressUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64) addF(simdPackage, "Int16x8.ConvertToInt8", opLen1(ssa.OpConvertToInt8Int16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.ConvertToInt8", opLen1(ssa.OpConvertToInt8Int16x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x32.ConvertToInt8", opLen1(ssa.OpConvertToInt8Int16x32, types.TypeVec256), sys.AMD64) diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml index b1283f4b6b..bb47819f2f 100644 --- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml @@ -220,3 +220,16 @@ documentation: !string |- // NAME selects the low and high 128-bit halves from the 128-bit halves // of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. + +- go: ConcatShiftBytesRight + commutative: false + documentation: !string |- + // NAME concatenates x and y and shift it right by constant bytes. + // The result vector will be the lower half of the concatenated vector. + +- go: ConcatShiftBytesRightGrouped + commutative: false + documentation: !string |- + // NAME concatenates x and y and shift it right by constant bytes. + // The result vector will be the lower half of the concatenated vector. + // This operation is performed grouped by each 16 byte. diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml index a1aefd8406..75fbc532b8 100644 --- a/src/simd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml @@ -824,3 +824,30 @@ inVariant: [] out: - *v + +- go: ConcatShiftBytesRight + asm: VPALIGNR + in: + - &uint128 + go: $t + base: uint + bits: 128 + - *uint128 + - class: immediate + immOffset: 0 + out: + - *uint128 + +- go: ConcatShiftBytesRightGrouped + asm: VPALIGNR + in: + - &uint256512 + go: $t + base: uint + bits: 256|512 + - *uint256512 + - class: immediate + immOffset: 0 + out: + - *uint256512 + \ No newline at end of file diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index 0f21c8594c..ee472d1163 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -1274,6 +1274,36 @@ func (x Uint64x4) Compress(mask Mask64x4) Uint64x4 // Asm: VPCOMPRESSQ, CPU Feature: AVX512 func (x Uint64x8) Compress(mask Mask64x8) Uint64x8 +/* ConcatShiftBytesRight */ + +// ConcatShiftBytesRight concatenates x and y and shift it right by constant bytes. +// The result vector will be the lower half of the concatenated vector. +// +// constant results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPALIGNR, CPU Feature: AVX +func (x Uint8x16) ConcatShiftBytesRight(constant uint8, y Uint8x16) Uint8x16 + +/* ConcatShiftBytesRightGrouped */ + +// ConcatShiftBytesRightGrouped concatenates x and y and shift it right by constant bytes. 
+// The result vector is the lower half of the shifted concatenation.
+// This operation is performed independently within each 16-byte group.
+//
+// Passing a constant for constant results in better performance; a non-constant value will be translated into a jump table.
+//
+// Asm: VPALIGNR, CPU Feature: AVX2
+func (x Uint8x32) ConcatShiftBytesRightGrouped(constant uint8, y Uint8x32) Uint8x32
+
+// ConcatShiftBytesRightGrouped concatenates x and y and shifts the result right by constant bytes.
+// The result vector is the lower half of the shifted concatenation.
+// This operation is performed independently within each 16-byte group.
+//
+// Passing a constant for constant results in better performance; a non-constant value will be translated into a jump table.
+//
+// Asm: VPALIGNR, CPU Feature: AVX512
+func (x Uint8x64) ConcatShiftBytesRightGrouped(constant uint8, y Uint8x64) Uint8x64
+
 /* ConvertToInt8 */
 
 // ConvertToInt8 converts element values to int8.
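For reference, the byte-level semantics of VPALIGNR — what ConcatShiftBytesRight exposes — can be modeled in scalar Go. The sketch below is not part of the CL; the helper name is hypothetical, and it assumes the x86 convention that the result is the low 16 bytes of the two-input concatenation shifted right by the immediate (which Go operand supplies the high versus low half is an assumption here).

package main

import "fmt"

// concatShiftBytesRight16 models the 128-bit ConcatShiftBytesRight:
// concatenate two 16-byte vectors (hi in the upper bytes, lo in the
// lower bytes), shift the 32-byte result right by n bytes, and keep
// the low 16 bytes. Bytes shifted in past the top are zero.
func concatShiftBytesRight16(hi, lo [16]byte, n uint8) [16]byte {
	var concat [32]byte
	copy(concat[:16], lo[:]) // low half of the concatenation
	copy(concat[16:], hi[:]) // high half of the concatenation
	var out [16]byte
	for i := range out {
		// Byte i of the result is byte n+i of the concatenation.
		if idx := int(n) + i; idx < 32 {
			out[i] = concat[idx]
		}
	}
	return out
}

func main() {
	var hi, lo [16]byte
	for i := range lo {
		lo[i] = byte(i)      // bytes 0..15
		hi[i] = byte(16 + i) // bytes 16..31
	}
	// A right shift by 4 selects bytes 4..19 of the concatenation.
	fmt.Println(concatShiftBytesRight16(hi, lo, 4))
	// [4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19]
}

The Grouped variants apply this same 16-byte operation independently to each 128-bit lane of a 256- or 512-bit vector, matching the ymm/zmm forms of VPALIGNR, which never move bytes across lanes.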
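Usage on the dev.simd branch would look roughly like the following. This is a hedged sketch, not code from the CL: it assumes the experimental simd package (GOEXPERIMENT=simd) with its LoadUint8x16/Store helpers, and an amd64 machine with AVX.

package main

import (
	"fmt"
	"simd" // experimental package; dev.simd branch only
)

func main() {
	var a, b [16]uint8
	for i := range a {
		a[i] = uint8(16 + i)
		b[i] = uint8(i)
	}
	x := simd.LoadUint8x16(&a) // assumed loader; see the simd package docs
	y := simd.LoadUint8x16(&b)
	// With a constant shift this should compile to a single VPALIGNR $4;
	// per the doc comments, a variable shift goes through a jump table.
	z := x.ConcatShiftBytesRight(4, y)
	var out [16]uint8
	z.Store(&out) // assumed store helper
	fmt.Println(out)
}

The masked rewrite rules added above (VMOVDQU8Masked*/VPBLENDMBMasked512/VPBLENDVB*) exist so that a masked or blended use of this result folds into one masked VPALIGNR when AVX-512 is available, rather than a VPALIGNR followed by a separate blend.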