]> Cypherpunks repositories - gostls13.git/commitdiff
[dev.simd] cmd/compile, simd: add packed saturated u?int conversions
authorJunyang Shao <shaojunyang@google.com>
Thu, 21 Aug 2025 17:45:37 +0000 (17:45 +0000)
committerJunyang Shao <shaojunyang@google.com>
Fri, 22 Aug 2025 16:10:18 +0000 (09:10 -0700)
This CL should complete the conversions between int and uint.

Change-Id: I46742a62214f346e014a68b9c72a9b116a127f67
Reviewed-on: https://go-review.googlesource.com/c/go/+/698236
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Commit-Queue: David Chase <drchase@google.com>
Reviewed-by: David Chase <drchase@google.com>
src/cmd/compile/internal/amd64/simdssa.go
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteAMD64.go
src/cmd/compile/internal/ssagen/simdintrinsics.go
src/simd/_gen/simdgen/ops/Converts/categories.yaml
src/simd/_gen/simdgen/ops/Converts/go.yaml
src/simd/ops_amd64.go

index b12690ca03bf58b873468e525250f0a9ed094c85..e4b0ca7a231c4e442898d613e2b7b7f54d9abb82 100644 (file)
@@ -200,6 +200,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPAVGW128,
                ssa.OpAMD64VPAVGW256,
                ssa.OpAMD64VPAVGW512,
+               ssa.OpAMD64VPACKSSDW128,
+               ssa.OpAMD64VPACKSSDW256,
+               ssa.OpAMD64VPACKSSDW512,
+               ssa.OpAMD64VPACKUSDW128,
+               ssa.OpAMD64VPACKUSDW256,
+               ssa.OpAMD64VPACKUSDW512,
                ssa.OpAMD64VPSIGNB128,
                ssa.OpAMD64VPSIGNB256,
                ssa.OpAMD64VPSIGNW128,
@@ -492,6 +498,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPAVGWMasked128,
                ssa.OpAMD64VPAVGWMasked256,
                ssa.OpAMD64VPAVGWMasked512,
+               ssa.OpAMD64VPACKSSDWMasked128,
+               ssa.OpAMD64VPACKSSDWMasked256,
+               ssa.OpAMD64VPACKSSDWMasked512,
+               ssa.OpAMD64VPACKUSDWMasked128,
+               ssa.OpAMD64VPACKUSDWMasked256,
+               ssa.OpAMD64VPACKUSDWMasked512,
                ssa.OpAMD64VDIVPSMasked128,
                ssa.OpAMD64VDIVPSMasked256,
                ssa.OpAMD64VDIVPSMasked512,
@@ -1437,6 +1449,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPMOVSDWMasked128,
                ssa.OpAMD64VPMOVSDWMasked256,
                ssa.OpAMD64VPMOVSQWMasked128,
+               ssa.OpAMD64VPACKSSDWMasked128,
+               ssa.OpAMD64VPACKSSDWMasked256,
+               ssa.OpAMD64VPACKSSDWMasked512,
                ssa.OpAMD64VPMOVSXBWMasked128,
                ssa.OpAMD64VCVTTPS2DQMasked128,
                ssa.OpAMD64VCVTTPS2DQMasked256,
@@ -1468,6 +1483,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPMOVUSDWMasked128,
                ssa.OpAMD64VPMOVUSDWMasked256,
                ssa.OpAMD64VPMOVUSQWMasked128,
+               ssa.OpAMD64VPACKUSDWMasked128,
+               ssa.OpAMD64VPACKUSDWMasked256,
+               ssa.OpAMD64VPACKUSDWMasked512,
                ssa.OpAMD64VPMOVZXBWMasked128,
                ssa.OpAMD64VCVTPS2UDQMasked128,
                ssa.OpAMD64VCVTPS2UDQMasked256,
index 372b5a79f6898beaa416367af601182d1bc7affa..c6dd5a38ce71e89b7e575426334670c8f1d63fd0 100644 (file)
 (ConvertToInt16SaturatedInt64x2 ...) => (VPMOVSQW128 ...)
 (ConvertToInt16SaturatedInt64x4 ...) => (VPMOVSQW128 ...)
 (ConvertToInt16SaturatedInt64x8 ...) => (VPMOVSQW128 ...)
+(ConvertToInt16SaturatedPackedInt32x4 ...) => (VPACKSSDW128 ...)
+(ConvertToInt16SaturatedPackedInt32x8 ...) => (VPACKSSDW256 ...)
+(ConvertToInt16SaturatedPackedInt32x16 ...) => (VPACKSSDW512 ...)
 (ConvertToInt16x8Int8x16 ...) => (VPMOVSXBW128 ...)
 (ConvertToInt32Float32x4 ...) => (VCVTTPS2DQ128 ...)
 (ConvertToInt32Float32x8 ...) => (VCVTTPS2DQ256 ...)
 (ConvertToUint16SaturatedUint64x2 ...) => (VPMOVUSQW128 ...)
 (ConvertToUint16SaturatedUint64x4 ...) => (VPMOVUSQW128 ...)
 (ConvertToUint16SaturatedUint64x8 ...) => (VPMOVUSQW128 ...)
+(ConvertToUint16SaturatedPackedUint32x4 ...) => (VPACKUSDW128 ...)
+(ConvertToUint16SaturatedPackedUint32x8 ...) => (VPACKUSDW256 ...)
+(ConvertToUint16SaturatedPackedUint32x16 ...) => (VPACKUSDW512 ...)
 (ConvertToUint16x8Uint8x16 ...) => (VPMOVZXBW128 ...)
 (ConvertToUint32Float32x4 ...) => (VCVTPS2UDQ128 ...)
 (ConvertToUint32Float32x8 ...) => (VCVTPS2UDQ256 ...)
 (VMOVDQU32Masked512 (VREDUCEPS512 [a] x) mask) => (VREDUCEPSMasked512 [a] x mask)
 (VMOVDQU64Masked512 (VREDUCEPD512 [a] x) mask) => (VREDUCEPDMasked512 [a] x mask)
 (VMOVDQU8Masked512 (VPMOVSXBW512 x) mask) => (VPMOVSXBWMasked512 x mask)
+(VMOVDQU32Masked512 (VPACKSSDW512 x y) mask) => (VPACKSSDWMasked512 x y mask)
 (VMOVDQU32Masked512 (VCVTTPS2DQ512 x) mask) => (VCVTTPS2DQMasked512 x mask)
 (VMOVDQU8Masked512 (VPMOVSXBD512 x) mask) => (VPMOVSXBDMasked512 x mask)
 (VMOVDQU16Masked512 (VPMOVSXWD512 x) mask) => (VPMOVSXWDMasked512 x mask)
 (VMOVDQU32Masked512 (VPMOVSXDQ512 x) mask) => (VPMOVSXDQMasked512 x mask)
 (VMOVDQU8Masked512 (VPMOVSXBQ512 x) mask) => (VPMOVSXBQMasked512 x mask)
 (VMOVDQU8Masked512 (VPMOVZXBW512 x) mask) => (VPMOVZXBWMasked512 x mask)
+(VMOVDQU32Masked512 (VPACKUSDW512 x y) mask) => (VPACKUSDWMasked512 x y mask)
 (VMOVDQU32Masked512 (VCVTPS2UDQ512 x) mask) => (VCVTPS2UDQMasked512 x mask)
 (VMOVDQU8Masked512 (VPMOVZXBD512 x) mask) => (VPMOVZXBDMasked512 x mask)
 (VMOVDQU16Masked512 (VPMOVZXWD512 x) mask) => (VPMOVZXWDMasked512 x mask)
index 773cb2063a6cdb46b477989ed428d74f67255935..c4ef39a30ea4bc2c5977e01f5555cc817801bbec 100644 (file)
@@ -182,6 +182,18 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                {name: "VPABSWMasked128", argLength: 2, reg: wkw, asm: "VPABSW", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VPABSWMasked256", argLength: 2, reg: wkw, asm: "VPABSW", commutative: false, typ: "Vec256", resultInArg0: false},
                {name: "VPABSWMasked512", argLength: 2, reg: wkw, asm: "VPABSW", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPACKSSDW128", argLength: 2, reg: v21, asm: "VPACKSSDW", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPACKSSDW256", argLength: 2, reg: v21, asm: "VPACKSSDW", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPACKSSDW512", argLength: 2, reg: w21, asm: "VPACKSSDW", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPACKSSDWMasked128", argLength: 3, reg: w2kw, asm: "VPACKSSDW", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPACKSSDWMasked256", argLength: 3, reg: w2kw, asm: "VPACKSSDW", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPACKSSDWMasked512", argLength: 3, reg: w2kw, asm: "VPACKSSDW", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPACKUSDW128", argLength: 2, reg: v21, asm: "VPACKUSDW", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPACKUSDW256", argLength: 2, reg: v21, asm: "VPACKUSDW", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPACKUSDW512", argLength: 2, reg: w21, asm: "VPACKUSDW", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPACKUSDWMasked128", argLength: 3, reg: w2kw, asm: "VPACKUSDW", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPACKUSDWMasked256", argLength: 3, reg: w2kw, asm: "VPACKUSDW", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPACKUSDWMasked512", argLength: 3, reg: w2kw, asm: "VPACKUSDW", commutative: false, typ: "Vec512", resultInArg0: false},
                {name: "VPADDB128", argLength: 2, reg: v21, asm: "VPADDB", commutative: true, typ: "Vec128", resultInArg0: false},
                {name: "VPADDB256", argLength: 2, reg: v21, asm: "VPADDB", commutative: true, typ: "Vec256", resultInArg0: false},
                {name: "VPADDB512", argLength: 2, reg: w21, asm: "VPADDB", commutative: true, typ: "Vec512", resultInArg0: false},
index 08dbf85771308e79eb25363ce2ea7f0bfde454dd..498c693e3cb1e1e62014e360eaa48d888ee39c5f 100644 (file)
@@ -235,6 +235,9 @@ func simdGenericOps() []opData {
                {name: "ConvertToInt16SaturatedInt64x2", argLength: 1, commutative: false},
                {name: "ConvertToInt16SaturatedInt64x4", argLength: 1, commutative: false},
                {name: "ConvertToInt16SaturatedInt64x8", argLength: 1, commutative: false},
+               {name: "ConvertToInt16SaturatedPackedInt32x4", argLength: 2, commutative: false},
+               {name: "ConvertToInt16SaturatedPackedInt32x8", argLength: 2, commutative: false},
+               {name: "ConvertToInt16SaturatedPackedInt32x16", argLength: 2, commutative: false},
                {name: "ConvertToInt16x8Int8x16", argLength: 1, commutative: false},
                {name: "ConvertToInt32Float32x4", argLength: 1, commutative: false},
                {name: "ConvertToInt32Float32x8", argLength: 1, commutative: false},
@@ -277,6 +280,9 @@ func simdGenericOps() []opData {
                {name: "ConvertToUint8Uint64x2", argLength: 1, commutative: false},
                {name: "ConvertToUint8Uint64x4", argLength: 1, commutative: false},
                {name: "ConvertToUint8Uint64x8", argLength: 1, commutative: false},
+               {name: "ConvertToUint16SaturatedPackedUint32x4", argLength: 2, commutative: false},
+               {name: "ConvertToUint16SaturatedPackedUint32x8", argLength: 2, commutative: false},
+               {name: "ConvertToUint16SaturatedPackedUint32x16", argLength: 2, commutative: false},
                {name: "ConvertToUint16SaturatedUint32x4", argLength: 1, commutative: false},
                {name: "ConvertToUint16SaturatedUint32x8", argLength: 1, commutative: false},
                {name: "ConvertToUint16SaturatedUint32x16", argLength: 1, commutative: false},
index aefe6a88dad77c05b1dd9bda84aee1b98285a8d7..7249752130bba37e3e1ff1877e507b9f6f16a31f 100644 (file)
@@ -1405,6 +1405,18 @@ const (
        OpAMD64VPABSWMasked128
        OpAMD64VPABSWMasked256
        OpAMD64VPABSWMasked512
+       OpAMD64VPACKSSDW128
+       OpAMD64VPACKSSDW256
+       OpAMD64VPACKSSDW512
+       OpAMD64VPACKSSDWMasked128
+       OpAMD64VPACKSSDWMasked256
+       OpAMD64VPACKSSDWMasked512
+       OpAMD64VPACKUSDW128
+       OpAMD64VPACKUSDW256
+       OpAMD64VPACKUSDW512
+       OpAMD64VPACKUSDWMasked128
+       OpAMD64VPACKUSDWMasked256
+       OpAMD64VPACKUSDWMasked512
        OpAMD64VPADDB128
        OpAMD64VPADDB256
        OpAMD64VPADDB512
@@ -5002,6 +5014,9 @@ const (
        OpConvertToInt16SaturatedInt64x2
        OpConvertToInt16SaturatedInt64x4
        OpConvertToInt16SaturatedInt64x8
+       OpConvertToInt16SaturatedPackedInt32x4
+       OpConvertToInt16SaturatedPackedInt32x8
+       OpConvertToInt16SaturatedPackedInt32x16
        OpConvertToInt16x8Int8x16
        OpConvertToInt32Float32x4
        OpConvertToInt32Float32x8
@@ -5044,6 +5059,9 @@ const (
        OpConvertToUint8Uint64x2
        OpConvertToUint8Uint64x4
        OpConvertToUint8Uint64x8
+       OpConvertToUint16SaturatedPackedUint32x4
+       OpConvertToUint16SaturatedPackedUint32x8
+       OpConvertToUint16SaturatedPackedUint32x16
        OpConvertToUint16SaturatedUint32x4
        OpConvertToUint16SaturatedUint32x8
        OpConvertToUint16SaturatedUint32x16
@@ -21608,6 +21626,180 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:   "VPACKSSDW128",
+               argLen: 2,
+               asm:    x86.AVPACKSSDW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPACKSSDW256",
+               argLen: 2,
+               asm:    x86.AVPACKSSDW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPACKSSDW512",
+               argLen: 2,
+               asm:    x86.AVPACKSSDW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPACKSSDWMasked128",
+               argLen: 3,
+               asm:    x86.AVPACKSSDW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPACKSSDWMasked256",
+               argLen: 3,
+               asm:    x86.AVPACKSSDW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPACKSSDWMasked512",
+               argLen: 3,
+               asm:    x86.AVPACKSSDW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPACKUSDW128",
+               argLen: 2,
+               asm:    x86.AVPACKUSDW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPACKUSDW256",
+               argLen: 2,
+               asm:    x86.AVPACKUSDW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPACKUSDW512",
+               argLen: 2,
+               asm:    x86.AVPACKUSDW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                               {1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPACKUSDWMasked128",
+               argLen: 3,
+               asm:    x86.AVPACKUSDW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPACKUSDWMasked256",
+               argLen: 3,
+               asm:    x86.AVPACKUSDW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
+       {
+               name:   "VPACKUSDWMasked512",
+               argLen: 3,
+               asm:    x86.AVPACKUSDW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                               {1, 2147418112},        // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+                       outputs: []outputInfo{
+                               {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+                       },
+               },
+       },
        {
                name:        "VPADDB128",
                argLen:      2,
@@ -66238,6 +66430,21 @@ var opcodeTable = [...]opInfo{
                argLen:  1,
                generic: true,
        },
+       {
+               name:    "ConvertToInt16SaturatedPackedInt32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ConvertToInt16SaturatedPackedInt32x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ConvertToInt16SaturatedPackedInt32x16",
+               argLen:  2,
+               generic: true,
+       },
        {
                name:    "ConvertToInt16x8Int8x16",
                argLen:  1,
@@ -66448,6 +66655,21 @@ var opcodeTable = [...]opInfo{
                argLen:  1,
                generic: true,
        },
+       {
+               name:    "ConvertToUint16SaturatedPackedUint32x4",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ConvertToUint16SaturatedPackedUint32x8",
+               argLen:  2,
+               generic: true,
+       },
+       {
+               name:    "ConvertToUint16SaturatedPackedUint32x16",
+               argLen:  2,
+               generic: true,
+       },
        {
                name:    "ConvertToUint16SaturatedUint32x4",
                argLen:  1,
index 53afacebf8cbd4073d415f95a192b69141c8110f..fea6b047d1319504e802f1f754ccc11b1150a687 100644 (file)
@@ -1412,6 +1412,15 @@ func rewriteValueAMD64(v *Value) bool {
        case OpConvertToInt16SaturatedInt64x8:
                v.Op = OpAMD64VPMOVSQW128
                return true
+       case OpConvertToInt16SaturatedPackedInt32x16:
+               v.Op = OpAMD64VPACKSSDW512
+               return true
+       case OpConvertToInt16SaturatedPackedInt32x4:
+               v.Op = OpAMD64VPACKSSDW128
+               return true
+       case OpConvertToInt16SaturatedPackedInt32x8:
+               v.Op = OpAMD64VPACKSSDW256
+               return true
        case OpConvertToInt16x8Int8x16:
                v.Op = OpAMD64VPMOVSXBW128
                return true
@@ -1538,6 +1547,15 @@ func rewriteValueAMD64(v *Value) bool {
        case OpConvertToInt8SaturatedInt64x8:
                v.Op = OpAMD64VPMOVSQB128
                return true
+       case OpConvertToUint16SaturatedPackedUint32x16:
+               v.Op = OpAMD64VPACKUSDW512
+               return true
+       case OpConvertToUint16SaturatedPackedUint32x4:
+               v.Op = OpAMD64VPACKUSDW128
+               return true
+       case OpConvertToUint16SaturatedPackedUint32x8:
+               v.Op = OpAMD64VPACKUSDW256
+               return true
        case OpConvertToUint16SaturatedUint32x16:
                v.Op = OpAMD64VPMOVUSDW256
                return true
@@ -27007,6 +27025,19 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool {
                v.AddArg2(x, mask)
                return true
        }
+       // match: (VMOVDQU32Masked512 (VPACKSSDW512 x y) mask)
+       // result: (VPACKSSDWMasked512 x y mask)
+       for {
+               if v_0.Op != OpAMD64VPACKSSDW512 {
+                       break
+               }
+               y := v_0.Args[1]
+               x := v_0.Args[0]
+               mask := v_1
+               v.reset(OpAMD64VPACKSSDWMasked512)
+               v.AddArg3(x, y, mask)
+               return true
+       }
        // match: (VMOVDQU32Masked512 (VCVTTPS2DQ512 x) mask)
        // result: (VCVTTPS2DQMasked512 x mask)
        for {
@@ -27031,6 +27062,19 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool {
                v.AddArg2(x, mask)
                return true
        }
+       // match: (VMOVDQU32Masked512 (VPACKUSDW512 x y) mask)
+       // result: (VPACKUSDWMasked512 x y mask)
+       for {
+               if v_0.Op != OpAMD64VPACKUSDW512 {
+                       break
+               }
+               y := v_0.Args[1]
+               x := v_0.Args[0]
+               mask := v_1
+               v.reset(OpAMD64VPACKUSDWMasked512)
+               v.AddArg3(x, y, mask)
+               return true
+       }
        // match: (VMOVDQU32Masked512 (VCVTPS2UDQ512 x) mask)
        // result: (VCVTPS2UDQMasked512 x mask)
        for {
index 2e31fdec19b18013d3199fdbbe794e9303de0e71..0bd4a27606b512985053c19d296db009da44e0ff 100644 (file)
@@ -255,6 +255,9 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Int64x2.ConvertToInt16Saturated", opLen1(ssa.OpConvertToInt16SaturatedInt64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Int64x4.ConvertToInt16Saturated", opLen1(ssa.OpConvertToInt16SaturatedInt64x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Int64x8.ConvertToInt16Saturated", opLen1(ssa.OpConvertToInt16SaturatedInt64x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x4.ConvertToInt16SaturatedPacked", opLen2(ssa.OpConvertToInt16SaturatedPackedInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x8.ConvertToInt16SaturatedPacked", opLen2(ssa.OpConvertToInt16SaturatedPackedInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x16.ConvertToInt16SaturatedPacked", opLen2(ssa.OpConvertToInt16SaturatedPackedInt32x16, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Int8x16.ConvertToInt16x8", opLen1(ssa.OpConvertToInt16x8Int8x16, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float32x4.ConvertToInt32", opLen1(ssa.OpConvertToInt32Float32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float32x8.ConvertToInt32", opLen1(ssa.OpConvertToInt32Float32x8, types.TypeVec256), sys.AMD64)
@@ -311,6 +314,9 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Uint64x2.ConvertToUint16Saturated", opLen1(ssa.OpConvertToUint16SaturatedUint64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint64x4.ConvertToUint16Saturated", opLen1(ssa.OpConvertToUint16SaturatedUint64x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Uint64x8.ConvertToUint16Saturated", opLen1(ssa.OpConvertToUint16SaturatedUint64x8, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint32x4.ConvertToUint16SaturatedPacked", opLen2(ssa.OpConvertToUint16SaturatedPackedUint32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint32x8.ConvertToUint16SaturatedPacked", opLen2(ssa.OpConvertToUint16SaturatedPackedUint32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint32x16.ConvertToUint16SaturatedPacked", opLen2(ssa.OpConvertToUint16SaturatedPackedUint32x16, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Uint8x16.ConvertToUint16x8", opLen1(ssa.OpConvertToUint16x8Uint8x16, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float32x4.ConvertToUint32", opLen1(ssa.OpConvertToUint32Float32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float32x8.ConvertToUint32", opLen1(ssa.OpConvertToUint32Float32x8, types.TypeVec256), sys.AMD64)
index 38e320b3d9686c76ceab51a7648a032e2946c3f9..9f02960862a8e3781f9f7f06ef9b6b80e25936c9 100644 (file)
   commutative: false
   documentation: !string |-
     // NAME converts element values to uint32 with saturation.
+- go: ConvertToInt16SaturatedPacked
+  commutative: false
+  documentation: !string |-
+    // NAME converts element values to int16 with saturation.
+- go: ConvertToUint16SaturatedPacked
+  commutative: false
+  documentation: !string |-
+    // NAME converts element values to uint16 with saturation.
 
 # low-part only conversions
 # int<->int or uint<->uint widening conversions.
index b4eb1eb1221d6ddd0185b1a5b89215909f97fccb..a82ae377dd65a5e77def1abf17821a836cf3175f 100644 (file)
     - base: uint
   out:
     - base: uint
+# Truncating saturated packed
+- go: ConvertToInt16SaturatedPacked
+  asm: "VPACKSSDW"
+  addDoc: &satDocPacked
+    !string |-
+    // With each 128-bit as a group:
+    // The converted group from the first input vector will be packed to the lower part of the result vector,
+    // the converted group from the second input vector will be packed to the upper part of the result vector.
+  in:
+    - base: int
+    - base: int
+  out:
+    - base: int
+- go: ConvertToUint16SaturatedPacked
+  asm: "VPACKUSDW"
+  addDoc: *satDocPacked
+  in:
+    - base: uint
+    - base: uint
+  out:
+    - base: uint
 
 # low-part only conversions.
 # uint8->uint16
index ba46b8802788f4ec3fd9ef2744b2e96ad530047b..7366aabd3260ae9b931bae68cb5ed1aaddb8b50b 100644 (file)
@@ -1408,6 +1408,32 @@ func (x Int64x4) ConvertToInt16Saturated() Int16x8
 // Asm: VPMOVSQW, CPU Feature: AVX512
 func (x Int64x8) ConvertToInt16Saturated() Int16x8
 
+/* ConvertToInt16SaturatedPacked */
+
+// ConvertToInt16SaturatedPacked converts element values to int16 with saturation.
+// With each 128-bit as a group:
+// The converted group from the first input vector will be packed to the lower part of the result vector,
+// the converted group from the second input vector will be packed to the upper part of the result vector.
+//
+// Asm: VPACKSSDW, CPU Feature: AVX
+func (x Int32x4) ConvertToInt16SaturatedPacked(y Int32x4) Int16x8
+
+// ConvertToInt16SaturatedPacked converts element values to int16 with saturation.
+// With each 128-bit as a group:
+// The converted group from the first input vector will be packed to the lower part of the result vector,
+// the converted group from the second input vector will be packed to the upper part of the result vector.
+//
+// Asm: VPACKSSDW, CPU Feature: AVX2
+func (x Int32x8) ConvertToInt16SaturatedPacked(y Int32x8) Int16x16
+
+// ConvertToInt16SaturatedPacked converts element values to int16 with saturation.
+// With each 128-bit as a group:
+// The converted group from the first input vector will be packed to the lower part of the result vector,
+// the converted group from the second input vector will be packed to the upper part of the result vector.
+//
+// Asm: VPACKSSDW, CPU Feature: AVX512
+func (x Int32x16) ConvertToInt16SaturatedPacked(y Int32x16) Int16x32
+
 /* ConvertToInt16x8 */
 
 // ConvertToInt16x8 converts 8 lowest vector element values to int16.
@@ -1768,6 +1794,32 @@ func (x Uint64x4) ConvertToUint16Saturated() Uint16x8
 // Asm: VPMOVUSQW, CPU Feature: AVX512
 func (x Uint64x8) ConvertToUint16Saturated() Uint16x8
 
+/* ConvertToUint16SaturatedPacked */
+
+// ConvertToUint16SaturatedPacked converts element values to uint16 with saturation.
+// With each 128-bit as a group:
+// The converted group from the first input vector will be packed to the lower part of the result vector,
+// the converted group from the second input vector will be packed to the upper part of the result vector.
+//
+// Asm: VPACKUSDW, CPU Feature: AVX
+func (x Uint32x4) ConvertToUint16SaturatedPacked(y Uint32x4) Uint16x8
+
+// ConvertToUint16SaturatedPacked converts element values to uint16 with saturation.
+// With each 128-bit as a group:
+// The converted group from the first input vector will be packed to the lower part of the result vector,
+// the converted group from the second input vector will be packed to the upper part of the result vector.
+//
+// Asm: VPACKUSDW, CPU Feature: AVX2
+func (x Uint32x8) ConvertToUint16SaturatedPacked(y Uint32x8) Uint16x16
+
+// ConvertToUint16SaturatedPacked converts element values to uint16 with saturation.
+// With each 128-bit as a group:
+// The converted group from the first input vector will be packed to the lower part of the result vector,
+// the converted group from the second input vector will be packed to the upper part of the result vector.
+//
+// Asm: VPACKUSDW, CPU Feature: AVX512
+func (x Uint32x16) ConvertToUint16SaturatedPacked(y Uint32x16) Uint16x32
+
 /* ConvertToUint16x8 */
 
 // ConvertToUint16x8 converts 8 lowest vector element values to uint16.