From bc217d4170f7cb8379386b54462bef62c76b4475 Mon Sep 17 00:00:00 2001 From: Junyang Shao Date: Thu, 21 Aug 2025 17:45:37 +0000 Subject: [PATCH] [dev.simd] cmd/compile, simd: add packed saturated u?int conversions This CL should complete the conversions between int and uint. Change-Id: I46742a62214f346e014a68b9c72a9b116a127f67 Reviewed-on: https://go-review.googlesource.com/c/go/+/698236 LUCI-TryBot-Result: Go LUCI Commit-Queue: David Chase Reviewed-by: David Chase --- src/cmd/compile/internal/amd64/simdssa.go | 18 ++ .../compile/internal/ssa/_gen/simdAMD64.rules | 8 + .../compile/internal/ssa/_gen/simdAMD64ops.go | 12 + .../internal/ssa/_gen/simdgenericOps.go | 6 + src/cmd/compile/internal/ssa/opGen.go | 222 ++++++++++++++++++ src/cmd/compile/internal/ssa/rewriteAMD64.go | 44 ++++ .../compile/internal/ssagen/simdintrinsics.go | 6 + .../_gen/simdgen/ops/Converts/categories.yaml | 8 + src/simd/_gen/simdgen/ops/Converts/go.yaml | 21 ++ src/simd/ops_amd64.go | 52 ++++ 10 files changed, 397 insertions(+) diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index b12690ca03..e4b0ca7a23 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -200,6 +200,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPAVGW128, ssa.OpAMD64VPAVGW256, ssa.OpAMD64VPAVGW512, + ssa.OpAMD64VPACKSSDW128, + ssa.OpAMD64VPACKSSDW256, + ssa.OpAMD64VPACKSSDW512, + ssa.OpAMD64VPACKUSDW128, + ssa.OpAMD64VPACKUSDW256, + ssa.OpAMD64VPACKUSDW512, ssa.OpAMD64VPSIGNB128, ssa.OpAMD64VPSIGNB256, ssa.OpAMD64VPSIGNW128, @@ -492,6 +498,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPAVGWMasked128, ssa.OpAMD64VPAVGWMasked256, ssa.OpAMD64VPAVGWMasked512, + ssa.OpAMD64VPACKSSDWMasked128, + ssa.OpAMD64VPACKSSDWMasked256, + ssa.OpAMD64VPACKSSDWMasked512, + ssa.OpAMD64VPACKUSDWMasked128, + ssa.OpAMD64VPACKUSDWMasked256, + ssa.OpAMD64VPACKUSDWMasked512, 
ssa.OpAMD64VDIVPSMasked128, ssa.OpAMD64VDIVPSMasked256, ssa.OpAMD64VDIVPSMasked512, @@ -1437,6 +1449,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMOVSDWMasked128, ssa.OpAMD64VPMOVSDWMasked256, ssa.OpAMD64VPMOVSQWMasked128, + ssa.OpAMD64VPACKSSDWMasked128, + ssa.OpAMD64VPACKSSDWMasked256, + ssa.OpAMD64VPACKSSDWMasked512, ssa.OpAMD64VPMOVSXBWMasked128, ssa.OpAMD64VCVTTPS2DQMasked128, ssa.OpAMD64VCVTTPS2DQMasked256, @@ -1468,6 +1483,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMOVUSDWMasked128, ssa.OpAMD64VPMOVUSDWMasked256, ssa.OpAMD64VPMOVUSQWMasked128, + ssa.OpAMD64VPACKUSDWMasked128, + ssa.OpAMD64VPACKUSDWMasked256, + ssa.OpAMD64VPACKUSDWMasked512, ssa.OpAMD64VPMOVZXBWMasked128, ssa.OpAMD64VCVTPS2UDQMasked128, ssa.OpAMD64VCVTPS2UDQMasked256, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 372b5a79f6..c6dd5a38ce 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -243,6 +243,9 @@ (ConvertToInt16SaturatedInt64x2 ...) => (VPMOVSQW128 ...) (ConvertToInt16SaturatedInt64x4 ...) => (VPMOVSQW128 ...) (ConvertToInt16SaturatedInt64x8 ...) => (VPMOVSQW128 ...) +(ConvertToInt16SaturatedPackedInt32x4 ...) => (VPACKSSDW128 ...) +(ConvertToInt16SaturatedPackedInt32x8 ...) => (VPACKSSDW256 ...) +(ConvertToInt16SaturatedPackedInt32x16 ...) => (VPACKSSDW512 ...) (ConvertToInt16x8Int8x16 ...) => (VPMOVSXBW128 ...) (ConvertToInt32Float32x4 ...) => (VCVTTPS2DQ128 ...) (ConvertToInt32Float32x8 ...) => (VCVTTPS2DQ256 ...) @@ -299,6 +302,9 @@ (ConvertToUint16SaturatedUint64x2 ...) => (VPMOVUSQW128 ...) (ConvertToUint16SaturatedUint64x4 ...) => (VPMOVUSQW128 ...) (ConvertToUint16SaturatedUint64x8 ...) => (VPMOVUSQW128 ...) +(ConvertToUint16SaturatedPackedUint32x4 ...) => (VPACKUSDW128 ...) +(ConvertToUint16SaturatedPackedUint32x8 ...) => (VPACKUSDW256 ...) 
+(ConvertToUint16SaturatedPackedUint32x16 ...) => (VPACKUSDW512 ...) (ConvertToUint16x8Uint8x16 ...) => (VPMOVZXBW128 ...) (ConvertToUint32Float32x4 ...) => (VCVTPS2UDQ128 ...) (ConvertToUint32Float32x8 ...) => (VCVTPS2UDQ256 ...) @@ -1244,6 +1250,7 @@ (VMOVDQU32Masked512 (VREDUCEPS512 [a] x) mask) => (VREDUCEPSMasked512 [a] x mask) (VMOVDQU64Masked512 (VREDUCEPD512 [a] x) mask) => (VREDUCEPDMasked512 [a] x mask) (VMOVDQU8Masked512 (VPMOVSXBW512 x) mask) => (VPMOVSXBWMasked512 x mask) +(VMOVDQU32Masked512 (VPACKSSDW512 x y) mask) => (VPACKSSDWMasked512 x y mask) (VMOVDQU32Masked512 (VCVTTPS2DQ512 x) mask) => (VCVTTPS2DQMasked512 x mask) (VMOVDQU8Masked512 (VPMOVSXBD512 x) mask) => (VPMOVSXBDMasked512 x mask) (VMOVDQU16Masked512 (VPMOVSXWD512 x) mask) => (VPMOVSXWDMasked512 x mask) @@ -1251,6 +1258,7 @@ (VMOVDQU32Masked512 (VPMOVSXDQ512 x) mask) => (VPMOVSXDQMasked512 x mask) (VMOVDQU8Masked512 (VPMOVSXBQ512 x) mask) => (VPMOVSXBQMasked512 x mask) (VMOVDQU8Masked512 (VPMOVZXBW512 x) mask) => (VPMOVZXBWMasked512 x mask) +(VMOVDQU32Masked512 (VPACKUSDW512 x y) mask) => (VPACKUSDWMasked512 x y mask) (VMOVDQU32Masked512 (VCVTPS2UDQ512 x) mask) => (VCVTPS2UDQMasked512 x mask) (VMOVDQU8Masked512 (VPMOVZXBD512 x) mask) => (VPMOVZXBDMasked512 x mask) (VMOVDQU16Masked512 (VPMOVZXWD512 x) mask) => (VPMOVZXWDMasked512 x mask) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index 773cb2063a..c4ef39a30e 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -182,6 +182,18 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPABSWMasked128", argLength: 2, reg: wkw, asm: "VPABSW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPABSWMasked256", argLength: 2, reg: wkw, asm: "VPABSW", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPABSWMasked512", argLength: 2, reg: wkw, asm: 
"VPABSW", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPACKSSDW128", argLength: 2, reg: v21, asm: "VPACKSSDW", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPACKSSDW256", argLength: 2, reg: v21, asm: "VPACKSSDW", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPACKSSDW512", argLength: 2, reg: w21, asm: "VPACKSSDW", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPACKSSDWMasked128", argLength: 3, reg: w2kw, asm: "VPACKSSDW", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPACKSSDWMasked256", argLength: 3, reg: w2kw, asm: "VPACKSSDW", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPACKSSDWMasked512", argLength: 3, reg: w2kw, asm: "VPACKSSDW", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPACKUSDW128", argLength: 2, reg: v21, asm: "VPACKUSDW", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPACKUSDW256", argLength: 2, reg: v21, asm: "VPACKUSDW", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPACKUSDW512", argLength: 2, reg: w21, asm: "VPACKUSDW", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPACKUSDWMasked128", argLength: 3, reg: w2kw, asm: "VPACKUSDW", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPACKUSDWMasked256", argLength: 3, reg: w2kw, asm: "VPACKUSDW", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPACKUSDWMasked512", argLength: 3, reg: w2kw, asm: "VPACKUSDW", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPADDB128", argLength: 2, reg: v21, asm: "VPADDB", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPADDB256", argLength: 2, reg: v21, asm: "VPADDB", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPADDB512", argLength: 2, reg: w21, asm: "VPADDB", commutative: true, typ: "Vec512", resultInArg0: false}, diff --git 
a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 08dbf85771..498c693e3c 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -235,6 +235,9 @@ func simdGenericOps() []opData { {name: "ConvertToInt16SaturatedInt64x2", argLength: 1, commutative: false}, {name: "ConvertToInt16SaturatedInt64x4", argLength: 1, commutative: false}, {name: "ConvertToInt16SaturatedInt64x8", argLength: 1, commutative: false}, + {name: "ConvertToInt16SaturatedPackedInt32x4", argLength: 2, commutative: false}, + {name: "ConvertToInt16SaturatedPackedInt32x8", argLength: 2, commutative: false}, + {name: "ConvertToInt16SaturatedPackedInt32x16", argLength: 2, commutative: false}, {name: "ConvertToInt16x8Int8x16", argLength: 1, commutative: false}, {name: "ConvertToInt32Float32x4", argLength: 1, commutative: false}, {name: "ConvertToInt32Float32x8", argLength: 1, commutative: false}, @@ -277,6 +280,9 @@ func simdGenericOps() []opData { {name: "ConvertToUint8Uint64x2", argLength: 1, commutative: false}, {name: "ConvertToUint8Uint64x4", argLength: 1, commutative: false}, {name: "ConvertToUint8Uint64x8", argLength: 1, commutative: false}, + {name: "ConvertToUint16SaturatedPackedUint32x4", argLength: 2, commutative: false}, + {name: "ConvertToUint16SaturatedPackedUint32x8", argLength: 2, commutative: false}, + {name: "ConvertToUint16SaturatedPackedUint32x16", argLength: 2, commutative: false}, {name: "ConvertToUint16SaturatedUint32x4", argLength: 1, commutative: false}, {name: "ConvertToUint16SaturatedUint32x8", argLength: 1, commutative: false}, {name: "ConvertToUint16SaturatedUint32x16", argLength: 1, commutative: false}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index aefe6a88da..7249752130 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1405,6 +1405,18 @@ const ( 
OpAMD64VPABSWMasked128 OpAMD64VPABSWMasked256 OpAMD64VPABSWMasked512 + OpAMD64VPACKSSDW128 + OpAMD64VPACKSSDW256 + OpAMD64VPACKSSDW512 + OpAMD64VPACKSSDWMasked128 + OpAMD64VPACKSSDWMasked256 + OpAMD64VPACKSSDWMasked512 + OpAMD64VPACKUSDW128 + OpAMD64VPACKUSDW256 + OpAMD64VPACKUSDW512 + OpAMD64VPACKUSDWMasked128 + OpAMD64VPACKUSDWMasked256 + OpAMD64VPACKUSDWMasked512 OpAMD64VPADDB128 OpAMD64VPADDB256 OpAMD64VPADDB512 @@ -5002,6 +5014,9 @@ const ( OpConvertToInt16SaturatedInt64x2 OpConvertToInt16SaturatedInt64x4 OpConvertToInt16SaturatedInt64x8 + OpConvertToInt16SaturatedPackedInt32x4 + OpConvertToInt16SaturatedPackedInt32x8 + OpConvertToInt16SaturatedPackedInt32x16 OpConvertToInt16x8Int8x16 OpConvertToInt32Float32x4 OpConvertToInt32Float32x8 @@ -5044,6 +5059,9 @@ const ( OpConvertToUint8Uint64x2 OpConvertToUint8Uint64x4 OpConvertToUint8Uint64x8 + OpConvertToUint16SaturatedPackedUint32x4 + OpConvertToUint16SaturatedPackedUint32x8 + OpConvertToUint16SaturatedPackedUint32x16 OpConvertToUint16SaturatedUint32x4 OpConvertToUint16SaturatedUint32x8 OpConvertToUint16SaturatedUint32x16 @@ -21608,6 +21626,180 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPACKSSDW128", + argLen: 2, + asm: x86.AVPACKSSDW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPACKSSDW256", + argLen: 2, + asm: x86.AVPACKSSDW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPACKSSDW512", + argLen: 2, + asm: x86.AVPACKSSDW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 
281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPACKSSDWMasked128", + argLen: 3, + asm: x86.AVPACKSSDW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPACKSSDWMasked256", + argLen: 3, + asm: x86.AVPACKSSDW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPACKSSDWMasked512", + argLen: 3, + asm: x86.AVPACKSSDW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPACKUSDW128", + argLen: 2, + asm: x86.AVPACKUSDW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ 
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPACKUSDW256", + argLen: 2, + asm: x86.AVPACKUSDW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPACKUSDW512", + argLen: 2, + asm: x86.AVPACKUSDW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPACKUSDWMasked128", + argLen: 3, + asm: x86.AVPACKUSDW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPACKUSDWMasked256", + argLen: 3, + asm: x86.AVPACKUSDW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPACKUSDWMasked512", + argLen: 3, + asm: x86.AVPACKUSDW, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // 
K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPADDB128", argLen: 2, @@ -66238,6 +66430,21 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "ConvertToInt16SaturatedPackedInt32x4", + argLen: 2, + generic: true, + }, + { + name: "ConvertToInt16SaturatedPackedInt32x8", + argLen: 2, + generic: true, + }, + { + name: "ConvertToInt16SaturatedPackedInt32x16", + argLen: 2, + generic: true, + }, { name: "ConvertToInt16x8Int8x16", argLen: 1, @@ -66448,6 +66655,21 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "ConvertToUint16SaturatedPackedUint32x4", + argLen: 2, + generic: true, + }, + { + name: "ConvertToUint16SaturatedPackedUint32x8", + argLen: 2, + generic: true, + }, + { + name: "ConvertToUint16SaturatedPackedUint32x16", + argLen: 2, + generic: true, + }, { name: "ConvertToUint16SaturatedUint32x4", argLen: 1, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 53afacebf8..fea6b047d1 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -1412,6 +1412,15 @@ func rewriteValueAMD64(v *Value) bool { case OpConvertToInt16SaturatedInt64x8: v.Op = OpAMD64VPMOVSQW128 return true + case OpConvertToInt16SaturatedPackedInt32x16: + v.Op = OpAMD64VPACKSSDW512 + return true + case OpConvertToInt16SaturatedPackedInt32x4: + v.Op = OpAMD64VPACKSSDW128 + return true + case OpConvertToInt16SaturatedPackedInt32x8: + v.Op = OpAMD64VPACKSSDW256 + return true case OpConvertToInt16x8Int8x16: v.Op = OpAMD64VPMOVSXBW128 return true @@ -1538,6 +1547,15 @@ func rewriteValueAMD64(v *Value) bool { case OpConvertToInt8SaturatedInt64x8: v.Op = OpAMD64VPMOVSQB128 return true + case 
OpConvertToUint16SaturatedPackedUint32x16: + v.Op = OpAMD64VPACKUSDW512 + return true + case OpConvertToUint16SaturatedPackedUint32x4: + v.Op = OpAMD64VPACKUSDW128 + return true + case OpConvertToUint16SaturatedPackedUint32x8: + v.Op = OpAMD64VPACKUSDW256 + return true case OpConvertToUint16SaturatedUint32x16: v.Op = OpAMD64VPMOVUSDW256 return true @@ -27007,6 +27025,19 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU32Masked512 (VPACKSSDW512 x y) mask) + // result: (VPACKSSDWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPACKSSDW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPACKSSDWMasked512) + v.AddArg3(x, y, mask) + return true + } // match: (VMOVDQU32Masked512 (VCVTTPS2DQ512 x) mask) // result: (VCVTTPS2DQMasked512 x mask) for { @@ -27031,6 +27062,19 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU32Masked512 (VPACKUSDW512 x y) mask) + // result: (VPACKUSDWMasked512 x y mask) + for { + if v_0.Op != OpAMD64VPACKUSDW512 { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPACKUSDWMasked512) + v.AddArg3(x, y, mask) + return true + } // match: (VMOVDQU32Masked512 (VCVTPS2UDQ512 x) mask) // result: (VCVTPS2UDQMasked512 x mask) for { diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 2e31fdec19..0bd4a27606 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -255,6 +255,9 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Int64x2.ConvertToInt16Saturated", opLen1(ssa.OpConvertToInt16SaturatedInt64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x4.ConvertToInt16Saturated", opLen1(ssa.OpConvertToInt16SaturatedInt64x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x8.ConvertToInt16Saturated", opLen1(ssa.OpConvertToInt16SaturatedInt64x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x4.ConvertToInt16SaturatedPacked", opLen2(ssa.OpConvertToInt16SaturatedPackedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.ConvertToInt16SaturatedPacked", opLen2(ssa.OpConvertToInt16SaturatedPackedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.ConvertToInt16SaturatedPacked", opLen2(ssa.OpConvertToInt16SaturatedPackedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x16.ConvertToInt16x8", opLen1(ssa.OpConvertToInt16x8Int8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.ConvertToInt32", opLen1(ssa.OpConvertToInt32Float32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.ConvertToInt32", opLen1(ssa.OpConvertToInt32Float32x8, types.TypeVec256), sys.AMD64) @@ -311,6 +314,9 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint64x2.ConvertToUint16Saturated", opLen1(ssa.OpConvertToUint16SaturatedUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.ConvertToUint16Saturated", opLen1(ssa.OpConvertToUint16SaturatedUint64x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x8.ConvertToUint16Saturated", opLen1(ssa.OpConvertToUint16SaturatedUint64x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x4.ConvertToUint16SaturatedPacked", opLen2(ssa.OpConvertToUint16SaturatedPackedUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x8.ConvertToUint16SaturatedPacked", opLen2(ssa.OpConvertToUint16SaturatedPackedUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x16.ConvertToUint16SaturatedPacked", opLen2(ssa.OpConvertToUint16SaturatedPackedUint32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x16.ConvertToUint16x8", opLen1(ssa.OpConvertToUint16x8Uint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.ConvertToUint32", opLen1(ssa.OpConvertToUint32Float32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.ConvertToUint32", opLen1(ssa.OpConvertToUint32Float32x8, types.TypeVec256), sys.AMD64) diff --git a/src/simd/_gen/simdgen/ops/Converts/categories.yaml b/src/simd/_gen/simdgen/ops/Converts/categories.yaml index 38e320b3d9..9f02960862 100644 --- a/src/simd/_gen/simdgen/ops/Converts/categories.yaml +++ b/src/simd/_gen/simdgen/ops/Converts/categories.yaml @@ -57,6 +57,14 @@ commutative: false documentation: !string |- // NAME converts element values to uint32 with saturation. +- go: ConvertToInt16SaturatedPacked + commutative: false + documentation: !string |- + // NAME converts element values to int16 with saturation. +- go: ConvertToUint16SaturatedPacked + commutative: false + documentation: !string |- + // NAME converts element values to uint16 with saturation. # low-part only conversions # int<->int or uint<->uint widening conversions. 
diff --git a/src/simd/_gen/simdgen/ops/Converts/go.yaml b/src/simd/_gen/simdgen/ops/Converts/go.yaml index b4eb1eb122..a82ae377dd 100644 --- a/src/simd/_gen/simdgen/ops/Converts/go.yaml +++ b/src/simd/_gen/simdgen/ops/Converts/go.yaml @@ -280,6 +280,27 @@ - base: uint out: - base: uint +# Truncating saturated packed +- go: ConvertToInt16SaturatedPacked + asm: "VPACKSSDW" + addDoc: &satDocPacked + !string |- + // With each 128-bit as a group: + // The converted group from the first input vector will be packed to the lower part of the result vector, + // the converted group from the second input vector will be packed to the upper part of the result vector. + in: + - base: int + - base: int + out: + - base: int +- go: ConvertToUint16SaturatedPacked + asm: "VPACKUSDW" + addDoc: *satDocPacked + in: + - base: uint + - base: uint + out: + - base: uint # low-part only conversions. # uint8->uint16 diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index ba46b88027..7366aabd32 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -1408,6 +1408,32 @@ func (x Int64x4) ConvertToInt16Saturated() Int16x8 // Asm: VPMOVSQW, CPU Feature: AVX512 func (x Int64x8) ConvertToInt16Saturated() Int16x8 +/* ConvertToInt16SaturatedPacked */ + +// ConvertToInt16SaturatedPacked converts element values to int16 with saturation. +// With each 128-bit as a group: +// The converted group from the first input vector will be packed to the lower part of the result vector, +// the converted group from the second input vector will be packed to the upper part of the result vector. +// +// Asm: VPACKSSDW, CPU Feature: AVX +func (x Int32x4) ConvertToInt16SaturatedPacked(y Int32x4) Int16x8 + +// ConvertToInt16SaturatedPacked converts element values to int16 with saturation.
+// With each 128-bit as a group: +// The converted group from the first input vector will be packed to the lower part of the result vector, +// the converted group from the second input vector will be packed to the upper part of the result vector. +// +// Asm: VPACKSSDW, CPU Feature: AVX2 +func (x Int32x8) ConvertToInt16SaturatedPacked(y Int32x8) Int16x16 + +// ConvertToInt16SaturatedPacked converts element values to int16 with saturation. +// With each 128-bit as a group: +// The converted group from the first input vector will be packed to the lower part of the result vector, +// the converted group from the second input vector will be packed to the upper part of the result vector. +// +// Asm: VPACKSSDW, CPU Feature: AVX512 +func (x Int32x16) ConvertToInt16SaturatedPacked(y Int32x16) Int16x32 + /* ConvertToInt16x8 */ // ConvertToInt16x8 converts 8 lowest vector element values to int16. @@ -1768,6 +1794,32 @@ func (x Uint64x4) ConvertToUint16Saturated() Uint16x8 // Asm: VPMOVUSQW, CPU Feature: AVX512 func (x Uint64x8) ConvertToUint16Saturated() Uint16x8 +/* ConvertToUint16SaturatedPacked */ + +// ConvertToUint16SaturatedPacked converts element values to uint16 with saturation. +// With each 128-bit as a group: +// The converted group from the first input vector will be packed to the lower part of the result vector, +// the converted group from the second input vector will be packed to the upper part of the result vector. +// +// Asm: VPACKUSDW, CPU Feature: AVX +func (x Uint32x4) ConvertToUint16SaturatedPacked(y Uint32x4) Uint16x8 + +// ConvertToUint16SaturatedPacked converts element values to uint16 with saturation. +// With each 128-bit as a group: +// The converted group from the first input vector will be packed to the lower part of the result vector, +// the converted group from the second input vector will be packed to the upper part of the result vector.
+// +// Asm: VPACKUSDW, CPU Feature: AVX2 +func (x Uint32x8) ConvertToUint16SaturatedPacked(y Uint32x8) Uint16x16 + +// ConvertToUint16SaturatedPacked converts element values to uint16 with saturation. +// With each 128-bit as a group: +// The converted group from the first input vector will be packed to the lower part of the result vector, +// the converted group from the second input vector will be packed to the upper part of the result vector. +// +// Asm: VPACKUSDW, CPU Feature: AVX512 +func (x Uint32x16) ConvertToUint16SaturatedPacked(y Uint32x16) Uint16x32 + /* ConvertToUint16x8 */ // ConvertToUint16x8 converts 8 lowest vector element values to uint16. -- 2.52.0