From baa0ae3aaacfcef6ae04beba78a2d2b06776e423 Mon Sep 17 00:00:00 2001 From: Cherry Mui Date: Fri, 19 Dec 2025 14:48:59 -0500 Subject: [PATCH] simd/archsimd: correct type and instruction for SaturateToUint8 It should be defined on unsigned types, not signed types, and use unsigned conversion instructions. Change-Id: I49694ccdf1d331cfde88591531c358d9886e83e6 Reviewed-on: https://go-review.googlesource.com/c/go/+/731500 LUCI-TryBot-Result: Go LUCI Reviewed-by: David Chase --- src/cmd/compile/internal/amd64/simdssa.go | 32 ++ .../compile/internal/ssa/_gen/simdAMD64.rules | 32 +- .../compile/internal/ssa/_gen/simdAMD64ops.go | 24 ++ .../internal/ssa/_gen/simdgenericOps.go | 16 +- src/cmd/compile/internal/ssa/opGen.go | 402 +++++++++++++++++- src/cmd/compile/internal/ssa/rewriteAMD64.go | 272 +++++++++++- .../compile/internal/ssagen/simdintrinsics.go | 16 +- .../_gen/simdgen/ops/Converts/go.yaml | 6 +- src/simd/archsimd/ops_amd64.go | 40 +- 9 files changed, 758 insertions(+), 82 deletions(-) diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index f6deba3ec1..13353c75a9 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -175,7 +175,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMOVSQD128_128, ssa.OpAMD64VPMOVSQD128_256, ssa.OpAMD64VPMOVSQD256, + ssa.OpAMD64VPMOVUSWB128_128, + ssa.OpAMD64VPMOVUSWB128_256, ssa.OpAMD64VPMOVUSWB256, + ssa.OpAMD64VPMOVUSDB128_128, + ssa.OpAMD64VPMOVUSDB128_256, + ssa.OpAMD64VPMOVUSDB128_512, + ssa.OpAMD64VPMOVUSQB128_128, + ssa.OpAMD64VPMOVUSQB128_256, + ssa.OpAMD64VPMOVUSQB128_512, ssa.OpAMD64VPMOVUSDW128_128, ssa.OpAMD64VPMOVUSDW128_256, ssa.OpAMD64VPMOVUSDW256, @@ -1010,7 +1018,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMOVSQDMasked128_128, ssa.OpAMD64VPMOVSQDMasked128_256, ssa.OpAMD64VPMOVSQDMasked256, + ssa.OpAMD64VPMOVUSWBMasked128_128, + ssa.OpAMD64VPMOVUSWBMasked128_256, ssa.OpAMD64VPMOVUSWBMasked256, + ssa.OpAMD64VPMOVUSDBMasked128_128, + ssa.OpAMD64VPMOVUSDBMasked128_256, + ssa.OpAMD64VPMOVUSDBMasked128_512, + ssa.OpAMD64VPMOVUSQBMasked128_128, + ssa.OpAMD64VPMOVUSQBMasked128_256, + ssa.OpAMD64VPMOVUSQBMasked128_512, ssa.OpAMD64VPMOVUSDWMasked128_128, ssa.OpAMD64VPMOVUSDWMasked128_256, ssa.OpAMD64VPMOVUSDWMasked256, @@ -2638,7 +2654,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMOVSQDMasked128_128Merging, ssa.OpAMD64VPMOVSQDMasked128_256Merging, ssa.OpAMD64VPMOVSQDMasked256Merging, + ssa.OpAMD64VPMOVUSWBMasked128_128Merging, + ssa.OpAMD64VPMOVUSWBMasked128_256Merging, ssa.OpAMD64VPMOVUSWBMasked256Merging, + ssa.OpAMD64VPMOVUSDBMasked128_128Merging, + ssa.OpAMD64VPMOVUSDBMasked128_256Merging, + ssa.OpAMD64VPMOVUSDBMasked128_512Merging, + ssa.OpAMD64VPMOVUSQBMasked128_128Merging, + ssa.OpAMD64VPMOVUSQBMasked128_256Merging, + ssa.OpAMD64VPMOVUSQBMasked128_512Merging, ssa.OpAMD64VPMOVUSDWMasked128_128Merging, ssa.OpAMD64VPMOVUSDWMasked128_256Merging, ssa.OpAMD64VPMOVUSDWMasked256Merging, @@ -3430,7 +3454,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMOVSQDMasked128_128, ssa.OpAMD64VPMOVSQDMasked128_256, ssa.OpAMD64VPMOVSQDMasked256, + ssa.OpAMD64VPMOVUSWBMasked128_128, + ssa.OpAMD64VPMOVUSWBMasked128_256, ssa.OpAMD64VPMOVUSWBMasked256, + ssa.OpAMD64VPMOVUSDBMasked128_128, + ssa.OpAMD64VPMOVUSDBMasked128_256, + ssa.OpAMD64VPMOVUSDBMasked128_512, + ssa.OpAMD64VPMOVUSQBMasked128_128, + ssa.OpAMD64VPMOVUSQBMasked128_256, + ssa.OpAMD64VPMOVUSQBMasked128_512, ssa.OpAMD64VPACKUSDWMasked128, ssa.OpAMD64VPACKUSDWMasked128load, ssa.OpAMD64VPACKUSDWMasked256, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 88d8567a3d..39d4f9b850 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -919,15 +919,15 @@ (SaturateToInt32Int64x2 ...) => (VPMOVSQD128_128 ...) (SaturateToInt32Int64x4 ...) => (VPMOVSQD128_256 ...) (SaturateToInt32Int64x8 ...) => (VPMOVSQD256 ...) -(SaturateToUint8Int16x8 ...) => (VPMOVSWB128_128 ...) -(SaturateToUint8Int16x16 ...) => (VPMOVSWB128_256 ...) -(SaturateToUint8Int32x4 ...) => (VPMOVSDB128_128 ...) -(SaturateToUint8Int32x8 ...) => (VPMOVSDB128_256 ...) -(SaturateToUint8Int32x16 ...) => (VPMOVSDB128_512 ...) -(SaturateToUint8Int64x2 ...) => (VPMOVSQB128_128 ...) -(SaturateToUint8Int64x4 ...) => (VPMOVSQB128_256 ...) -(SaturateToUint8Int64x8 ...) => (VPMOVSQB128_512 ...) +(SaturateToUint8Uint16x8 ...) => (VPMOVUSWB128_128 ...) +(SaturateToUint8Uint16x16 ...) => (VPMOVUSWB128_256 ...) (SaturateToUint8Uint16x32 ...) => (VPMOVUSWB256 ...) +(SaturateToUint8Uint32x4 ...) => (VPMOVUSDB128_128 ...) +(SaturateToUint8Uint32x8 ...) => (VPMOVUSDB128_256 ...) +(SaturateToUint8Uint32x16 ...) => (VPMOVUSDB128_512 ...) +(SaturateToUint8Uint64x2 ...) => (VPMOVUSQB128_128 ...) +(SaturateToUint8Uint64x4 ...) => (VPMOVUSQB128_256 ...) +(SaturateToUint8Uint64x8 ...) => (VPMOVUSQB128_512 ...) (SaturateToUint16Uint32x4 ...) => (VPMOVUSDW128_128 ...) (SaturateToUint16Uint32x8 ...) => (VPMOVUSDW128_256 ...) (SaturateToUint16Uint32x16 ...) => (VPMOVUSDW256 ...) @@ -1787,7 +1787,15 @@ (VMOVDQU64Masked128 (VPMOVSQD128_128 x) mask) => (VPMOVSQDMasked128_128 x mask) (VMOVDQU64Masked256 (VPMOVSQD128_256 x) mask) => (VPMOVSQDMasked128_256 x mask) (VMOVDQU64Masked256 (VPMOVSQD256 x) mask) => (VPMOVSQDMasked256 x mask) +(VMOVDQU16Masked128 (VPMOVUSWB128_128 x) mask) => (VPMOVUSWBMasked128_128 x mask) +(VMOVDQU16Masked256 (VPMOVUSWB128_256 x) mask) => (VPMOVUSWBMasked128_256 x mask) (VMOVDQU16Masked256 (VPMOVUSWB256 x) mask) => (VPMOVUSWBMasked256 x mask) +(VMOVDQU32Masked128 (VPMOVUSDB128_128 x) mask) => (VPMOVUSDBMasked128_128 x mask) +(VMOVDQU32Masked256 (VPMOVUSDB128_256 x) mask) => (VPMOVUSDBMasked128_256 x mask) +(VMOVDQU32Masked512 (VPMOVUSDB128_512 x) mask) => (VPMOVUSDBMasked128_512 x mask) +(VMOVDQU64Masked128 (VPMOVUSQB128_128 x) mask) => (VPMOVUSQBMasked128_128 x mask) +(VMOVDQU64Masked256 (VPMOVUSQB128_256 x) mask) => (VPMOVUSQBMasked128_256 x mask) +(VMOVDQU64Masked512 (VPMOVUSQB128_512 x) mask) => (VPMOVUSQBMasked128_512 x mask) (VMOVDQU32Masked128 (VPACKUSDW128 x y) mask) => (VPACKUSDWMasked128 x y mask) (VMOVDQU32Masked256 (VPACKUSDW256 x y) mask) => (VPACKUSDWMasked256 x y mask) (VMOVDQU32Masked512 (VPACKUSDW512 x y) mask) => (VPACKUSDWMasked512 x y mask) @@ -2018,6 +2026,7 @@ (VPBLENDMDMasked512 dst (VPMOVDW256 x) mask) => (VPMOVDWMasked256Merging dst x mask) (VPBLENDMDMasked512 dst (VPMOVSDB128_512 x) mask) => (VPMOVSDBMasked128_512Merging dst x mask) (VPBLENDMDMasked512 dst (VPMOVSDW256 x) mask) => (VPMOVSDWMasked256Merging dst x mask) +(VPBLENDMDMasked512 dst (VPMOVUSDB128_512 x) mask) => (VPMOVUSDBMasked128_512Merging dst x mask) (VPBLENDMDMasked512 dst (VPMOVUSDW256 x) mask) => (VPMOVUSDWMasked256Merging dst x mask) (VPBLENDMDMasked512 dst (VPMULLD512 x y) mask) => (VPMULLDMasked512Merging dst x y mask) (VPBLENDMDMasked512 dst (VPOPCNTD512 x) mask) => (VPOPCNTDMasked512Merging dst x mask) @@ -2071,6 +2080,7 @@ (VPBLENDMQMasked512 dst (VPMOVSQB128_512 x) mask) => (VPMOVSQBMasked128_512Merging dst x mask) (VPBLENDMQMasked512 dst (VPMOVSQD256 x) mask) => (VPMOVSQDMasked256Merging dst x mask) (VPBLENDMQMasked512 dst (VPMOVSQW128_512 x) mask) => (VPMOVSQWMasked128_512Merging dst x mask) +(VPBLENDMQMasked512 dst (VPMOVUSQB128_512 x) mask) => (VPMOVUSQBMasked128_512Merging dst x mask) (VPBLENDMQMasked512 dst (VPMOVUSQD256 x) mask) => (VPMOVUSQDMasked256Merging dst x mask) (VPBLENDMQMasked512 dst (VPMOVUSQW128_512 x) mask) => (VPMOVUSQWMasked128_512Merging dst x mask) (VPBLENDMQMasked512 dst (VPMULLQ512 x y) mask) => (VPMULLQMasked512Merging dst x y mask) @@ -2235,9 +2245,12 @@ (VPBLENDVB128 dst (VPMOVSXWQ128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXWQMasked128Merging dst x (VPMOVVec16x8ToM mask)) (VPBLENDVB128 dst (VPMOVSXWQ256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXWQMasked256Merging dst x (VPMOVVec16x8ToM mask)) (VPBLENDVB128 dst (VPMOVSXWQ512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXWQMasked512Merging dst x (VPMOVVec16x8ToM mask)) +(VPBLENDVB128 dst (VPMOVUSDB128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSDBMasked128_128Merging dst x (VPMOVVec32x4ToM mask)) (VPBLENDVB128 dst (VPMOVUSDW128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSDWMasked128_128Merging dst x (VPMOVVec32x4ToM mask)) +(VPBLENDVB128 dst (VPMOVUSQB128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQBMasked128_128Merging dst x (VPMOVVec64x2ToM mask)) (VPBLENDVB128 dst (VPMOVUSQD128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQDMasked128_128Merging dst x (VPMOVVec64x2ToM mask)) (VPBLENDVB128 dst (VPMOVUSQW128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQWMasked128_128Merging dst x (VPMOVVec64x2ToM mask)) +(VPBLENDVB128 dst (VPMOVUSWB128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSWBMasked128_128Merging dst x (VPMOVVec16x8ToM mask)) (VPBLENDVB128 dst (VPMOVWB128_128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVWBMasked128_128Merging dst x (VPMOVVec16x8ToM mask)) (VPBLENDVB128 dst (VPMOVZXBD128 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVZXBDMasked128Merging dst x (VPMOVVec8x16ToM mask)) (VPBLENDVB128 dst (VPMOVZXBD256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVZXBDMasked256Merging dst x (VPMOVVec8x16ToM mask)) @@ -2396,9 +2409,12 @@ (VPBLENDVB256 dst (VPMOVSXBW512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXBWMasked512Merging dst x (VPMOVVec8x32ToM mask)) (VPBLENDVB256 dst (VPMOVSXDQ512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXDQMasked512Merging dst x (VPMOVVec32x8ToM mask)) (VPBLENDVB256 dst (VPMOVSXWD512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVSXWDMasked512Merging dst x (VPMOVVec16x16ToM mask)) +(VPBLENDVB256 dst (VPMOVUSDB128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSDBMasked128_256Merging dst x (VPMOVVec32x8ToM mask)) (VPBLENDVB256 dst (VPMOVUSDW128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSDWMasked128_256Merging dst x (VPMOVVec32x8ToM mask)) +(VPBLENDVB256 dst (VPMOVUSQB128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQBMasked128_256Merging dst x (VPMOVVec64x4ToM mask)) (VPBLENDVB256 dst (VPMOVUSQD128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQDMasked128_256Merging dst x (VPMOVVec64x4ToM mask)) (VPBLENDVB256 dst (VPMOVUSQW128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSQWMasked128_256Merging dst x (VPMOVVec64x4ToM mask)) +(VPBLENDVB256 dst (VPMOVUSWB128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVUSWBMasked128_256Merging dst x (VPMOVVec16x16ToM mask)) (VPBLENDVB256 dst (VPMOVWB128_256 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVWBMasked128_256Merging dst x (VPMOVVec16x16ToM mask)) (VPBLENDVB256 dst (VPMOVZXBW512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVZXBWMasked512Merging dst x (VPMOVVec8x32ToM mask)) (VPBLENDVB256 dst (VPMOVZXDQ512 x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPMOVZXDQMasked512Merging dst x (VPMOVVec32x8ToM mask)) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index f38d24fde7..cd0cedc831 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -780,12 +780,24 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPMOVSXWQMasked128", argLength: 2, reg: wkw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVSXWQMasked256", argLength: 2, reg: wkw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPMOVSXWQMasked512", argLength: 2, reg: wkw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPMOVUSDB128_128", argLength: 1, reg: w11, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSDB128_256", argLength: 1, reg: w11, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSDB128_512", argLength: 1, reg: w11, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSDBMasked128_128", argLength: 2, reg: wkw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSDBMasked128_256", argLength: 2, reg: wkw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSDBMasked128_512", argLength: 2, reg: wkw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSDW128_128", argLength: 1, reg: w11, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSDW128_256", argLength: 1, reg: w11, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSDW256", argLength: 1, reg: w11, asm: "VPMOVUSDW", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPMOVUSDWMasked128_128", argLength: 2, reg: wkw, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSDWMasked128_256", argLength: 2, reg: wkw, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSDWMasked256", argLength: 2, reg: wkw, asm: "VPMOVUSDW", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPMOVUSQB128_128", argLength: 1, reg: w11, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSQB128_256", argLength: 1, reg: w11, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSQB128_512", argLength: 1, reg: w11, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSQBMasked128_128", argLength: 2, reg: wkw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSQBMasked128_256", argLength: 2, reg: wkw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSQBMasked128_512", argLength: 2, reg: wkw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSQD128_128", argLength: 1, reg: w11, asm: "VPMOVUSQD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSQD128_256", argLength: 1, reg: w11, asm: "VPMOVUSQD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSQD256", argLength: 1, reg: w11, asm: "VPMOVUSQD", commutative: false, typ: "Vec256", resultInArg0: false}, @@ -798,7 +810,11 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPMOVUSQWMasked128_128", argLength: 2, reg: wkw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSQWMasked128_256", argLength: 2, reg: wkw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSQWMasked128_512", argLength: 2, reg: wkw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSWB128_128", argLength: 1, reg: w11, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSWB128_256", argLength: 1, reg: w11, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSWB256", argLength: 1, reg: w11, asm: "VPMOVUSWB", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPMOVUSWBMasked128_128", argLength: 2, reg: wkw, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPMOVUSWBMasked128_256", argLength: 2, reg: wkw, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVUSWBMasked256", argLength: 2, reg: wkw, asm: "VPMOVUSWB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPMOVWB128_128", argLength: 1, reg: w11, asm: "VPMOVWB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMOVWB128_256", argLength: 1, reg: w11, asm: "VPMOVWB", commutative: false, typ: "Vec128", resultInArg0: false}, @@ -2382,15 +2398,23 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPMOVSXWQMasked128Merging", argLength: 3, reg: w2kw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVSXWQMasked256Merging", argLength: 3, reg: w2kw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPMOVSXWQMasked512Merging", argLength: 3, reg: w2kw, asm: "VPMOVSXWQ", commutative: false, typ: "Vec512", resultInArg0: true}, + {name: "VPMOVUSDBMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPMOVUSDBMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPMOVUSDBMasked128_512Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDB", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSDWMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSDWMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDW", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSDWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSDW", commutative: false, typ: "Vec256", resultInArg0: true}, + {name: "VPMOVUSQBMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPMOVUSQBMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPMOVUSQBMasked128_512Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQB", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSQDMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQD", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSQDMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQD", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSQDMasked256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPMOVUSQWMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSQWMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSQWMasked128_512Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSQW", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPMOVUSWBMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPMOVUSWBMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSWB", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVUSWBMasked256Merging", argLength: 3, reg: w2kw, asm: "VPMOVUSWB", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPMOVWBMasked128_128Merging", argLength: 3, reg: w2kw, asm: "VPMOVWB", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPMOVWBMasked128_256Merging", argLength: 3, reg: w2kw, asm: "VPMOVWB", commutative: false, typ: "Vec128", resultInArg0: true}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index a68d8c4122..36f3703bf1 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -842,15 +842,15 @@ func simdGenericOps() []opData { {name: "SaturateToInt32Int64x2", argLength: 1, commutative: false}, {name: "SaturateToInt32Int64x4", argLength: 1, commutative: false}, {name: "SaturateToInt32Int64x8", argLength: 1, commutative: false}, - {name: "SaturateToUint8Int16x8", argLength: 1, commutative: false}, - {name: "SaturateToUint8Int16x16", argLength: 1, commutative: false}, - {name: "SaturateToUint8Int32x4", argLength: 1, commutative: false}, - {name: "SaturateToUint8Int32x8", argLength: 1, commutative: false}, - {name: "SaturateToUint8Int32x16", argLength: 1, commutative: false}, - {name: "SaturateToUint8Int64x2", argLength: 1, commutative: false}, - {name: "SaturateToUint8Int64x4", argLength: 1, commutative: false}, - {name: "SaturateToUint8Int64x8", argLength: 1, commutative: false}, + {name: "SaturateToUint8Uint16x8", argLength: 1, commutative: false}, + {name: "SaturateToUint8Uint16x16", argLength: 1, commutative: false}, {name: "SaturateToUint8Uint16x32", argLength: 1, commutative: false}, + {name: "SaturateToUint8Uint32x4", argLength: 1, commutative: false}, + {name: "SaturateToUint8Uint32x8", argLength: 1, commutative: false}, + {name: "SaturateToUint8Uint32x16", argLength: 1, commutative: false}, + {name: "SaturateToUint8Uint64x2", argLength: 1, commutative: false}, + {name: "SaturateToUint8Uint64x4", argLength: 1, commutative: false}, + {name: "SaturateToUint8Uint64x8", argLength: 1, commutative: false}, {name: "SaturateToUint16ConcatUint32x4", argLength: 2, commutative: false}, {name: "SaturateToUint16ConcatUint32x8", argLength: 2, commutative: false}, {name: "SaturateToUint16ConcatUint32x16", argLength: 2, commutative: false}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 00d581ec9a..71ad2c2a9a 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -2021,12 +2021,24 @@ const ( OpAMD64VPMOVSXWQMasked128 OpAMD64VPMOVSXWQMasked256 OpAMD64VPMOVSXWQMasked512 + OpAMD64VPMOVUSDB128_128 + OpAMD64VPMOVUSDB128_256 + OpAMD64VPMOVUSDB128_512 + OpAMD64VPMOVUSDBMasked128_128 + OpAMD64VPMOVUSDBMasked128_256 + OpAMD64VPMOVUSDBMasked128_512 OpAMD64VPMOVUSDW128_128 OpAMD64VPMOVUSDW128_256 OpAMD64VPMOVUSDW256 OpAMD64VPMOVUSDWMasked128_128 OpAMD64VPMOVUSDWMasked128_256 OpAMD64VPMOVUSDWMasked256 + OpAMD64VPMOVUSQB128_128 + OpAMD64VPMOVUSQB128_256 + OpAMD64VPMOVUSQB128_512 + OpAMD64VPMOVUSQBMasked128_128 + OpAMD64VPMOVUSQBMasked128_256 + OpAMD64VPMOVUSQBMasked128_512 OpAMD64VPMOVUSQD128_128 OpAMD64VPMOVUSQD128_256 OpAMD64VPMOVUSQD256 @@ -2039,7 +2051,11 @@ const ( OpAMD64VPMOVUSQWMasked128_128 OpAMD64VPMOVUSQWMasked128_256 OpAMD64VPMOVUSQWMasked128_512 + OpAMD64VPMOVUSWB128_128 + OpAMD64VPMOVUSWB128_256 OpAMD64VPMOVUSWB256 + OpAMD64VPMOVUSWBMasked128_128 + OpAMD64VPMOVUSWBMasked128_256 OpAMD64VPMOVUSWBMasked256 OpAMD64VPMOVWB128_128 OpAMD64VPMOVWB128_256 @@ -3623,15 +3639,23 @@ const ( OpAMD64VPMOVSXWQMasked128Merging OpAMD64VPMOVSXWQMasked256Merging OpAMD64VPMOVSXWQMasked512Merging + OpAMD64VPMOVUSDBMasked128_128Merging + OpAMD64VPMOVUSDBMasked128_256Merging + OpAMD64VPMOVUSDBMasked128_512Merging OpAMD64VPMOVUSDWMasked128_128Merging OpAMD64VPMOVUSDWMasked128_256Merging OpAMD64VPMOVUSDWMasked256Merging + OpAMD64VPMOVUSQBMasked128_128Merging + OpAMD64VPMOVUSQBMasked128_256Merging + OpAMD64VPMOVUSQBMasked128_512Merging OpAMD64VPMOVUSQDMasked128_128Merging OpAMD64VPMOVUSQDMasked128_256Merging OpAMD64VPMOVUSQDMasked256Merging OpAMD64VPMOVUSQWMasked128_128Merging OpAMD64VPMOVUSQWMasked128_256Merging OpAMD64VPMOVUSQWMasked128_512Merging + OpAMD64VPMOVUSWBMasked128_128Merging + OpAMD64VPMOVUSWBMasked128_256Merging OpAMD64VPMOVUSWBMasked256Merging OpAMD64VPMOVWBMasked128_128Merging OpAMD64VPMOVWBMasked128_256Merging @@ -6992,15 +7016,15 @@ const ( OpSaturateToInt32Int64x2 OpSaturateToInt32Int64x4 OpSaturateToInt32Int64x8 - OpSaturateToUint8Int16x8 - OpSaturateToUint8Int16x16 - OpSaturateToUint8Int32x4 - OpSaturateToUint8Int32x8 - OpSaturateToUint8Int32x16 - OpSaturateToUint8Int64x2 - OpSaturateToUint8Int64x4 - OpSaturateToUint8Int64x8 + OpSaturateToUint8Uint16x8 + OpSaturateToUint8Uint16x16 OpSaturateToUint8Uint16x32 + OpSaturateToUint8Uint32x4 + OpSaturateToUint8Uint32x8 + OpSaturateToUint8Uint32x16 + OpSaturateToUint8Uint64x2 + OpSaturateToUint8Uint64x4 + OpSaturateToUint8Uint64x8 OpSaturateToUint16ConcatUint32x4 OpSaturateToUint16ConcatUint32x8 OpSaturateToUint16ConcatUint32x16 @@ -32103,6 +32127,87 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPMOVUSDB128_128", + argLen: 1, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSDB128_256", + argLen: 1, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSDB128_512", + argLen: 1, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSDBMasked128_128", + argLen: 2, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSDBMasked128_256", + argLen: 2, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSDBMasked128_512", + argLen: 2, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPMOVUSDW128_128", argLen: 1, @@ -32184,6 +32289,87 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPMOVUSQB128_128", + argLen: 1, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSQB128_256", + argLen: 1, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSQB128_512", + argLen: 1, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSQBMasked128_128", + argLen: 2, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSQBMasked128_256", + argLen: 2, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSQBMasked128_512", + argLen: 2, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPMOVUSQD128_128", argLen: 1, @@ -32346,6 +32532,32 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPMOVUSWB128_128", + argLen: 1, + asm: x86.AVPMOVUSWB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSWB128_256", + argLen: 1, + asm: x86.AVPMOVUSWB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPMOVUSWB256", argLen: 1, @@ -32359,6 +32571,34 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPMOVUSWBMasked128_128", + argLen: 2, + asm: x86.AVPMOVUSWB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSWBMasked128_256", + argLen: 2, + asm: x86.AVPMOVUSWB, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPMOVUSWBMasked256", argLen: 2, @@ -57268,6 +57508,54 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPMOVUSDBMasked128_128Merging", + argLen: 3, + resultInArg0: true, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSDBMasked128_256Merging", + argLen: 3, + resultInArg0: true, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSDBMasked128_512Merging", + argLen: 3, + resultInArg0: true, + asm: x86.AVPMOVUSDB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPMOVUSDWMasked128_128Merging", argLen: 3, @@ -57316,6 +57604,54 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPMOVUSQBMasked128_128Merging", + argLen: 3, + resultInArg0: true, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSQBMasked128_256Merging", + argLen: 3, + resultInArg0: true, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSQBMasked128_512Merging", + argLen: 3, + resultInArg0: true, + asm: x86.AVPMOVUSQB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPMOVUSQDMasked128_128Merging", argLen: 3, @@ -57412,6 +57748,38 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPMOVUSWBMasked128_128Merging", + argLen: 3, + resultInArg0: true, + asm: x86.AVPMOVUSWB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPMOVUSWBMasked128_256Merging", + argLen: 3, + resultInArg0: true, + asm: x86.AVPMOVUSWB, + reg: regInfo{ + inputs: []inputInfo{ + {2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPMOVUSWBMasked256Merging", argLen: 3, @@ -93430,47 +93798,47 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "SaturateToUint8Int16x8", + name: "SaturateToUint8Uint16x8", argLen: 1, generic: true, }, { - name: "SaturateToUint8Int16x16", + name: "SaturateToUint8Uint16x16", argLen: 1, generic: true, }, { - name: "SaturateToUint8Int32x4", + name: "SaturateToUint8Uint16x32", argLen: 1, generic: true, }, { - name: "SaturateToUint8Int32x8", + name: "SaturateToUint8Uint32x4", argLen: 1, generic: true, }, { - name: "SaturateToUint8Int32x16", + name: "SaturateToUint8Uint32x8", argLen: 1, generic: true, }, { - name: "SaturateToUint8Int64x2", + name: "SaturateToUint8Uint32x16", argLen: 1, generic: true, }, { - name: "SaturateToUint8Int64x4", + name: "SaturateToUint8Uint64x2", argLen: 1, generic: true, }, { - name: "SaturateToUint8Int64x8", + name: "SaturateToUint8Uint64x4", argLen: 1, generic: true, }, { - name: "SaturateToUint8Uint16x32", + name: "SaturateToUint8Uint64x8", argLen: 1, generic: true, }, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 797757c322..9efc566c48 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -5139,32 +5139,32 @@ func rewriteValueAMD64(v *Value) bool { case OpSaturateToUint32Uint64x8: v.Op = OpAMD64VPMOVUSQD256 return true - case OpSaturateToUint8Int16x16: - v.Op = OpAMD64VPMOVSWB128_256 + case OpSaturateToUint8Uint16x16: + v.Op = OpAMD64VPMOVUSWB128_256 return true - case OpSaturateToUint8Int16x8: - v.Op = OpAMD64VPMOVSWB128_128 + case OpSaturateToUint8Uint16x32: + v.Op = OpAMD64VPMOVUSWB256 return true - case OpSaturateToUint8Int32x16: - v.Op = OpAMD64VPMOVSDB128_512 + case OpSaturateToUint8Uint16x8: + v.Op = OpAMD64VPMOVUSWB128_128 return true - case OpSaturateToUint8Int32x4: - v.Op = OpAMD64VPMOVSDB128_128 + case OpSaturateToUint8Uint32x16: + v.Op = OpAMD64VPMOVUSDB128_512 return true - case OpSaturateToUint8Int32x8: - v.Op = OpAMD64VPMOVSDB128_256 + case OpSaturateToUint8Uint32x4: + v.Op = OpAMD64VPMOVUSDB128_128 return true - case OpSaturateToUint8Int64x2: - v.Op = OpAMD64VPMOVSQB128_128 + case OpSaturateToUint8Uint32x8: + v.Op = OpAMD64VPMOVUSDB128_256 return true - case OpSaturateToUint8Int64x4: - v.Op = OpAMD64VPMOVSQB128_256 + case OpSaturateToUint8Uint64x2: + v.Op = OpAMD64VPMOVUSQB128_128 return true - case OpSaturateToUint8Int64x8: - v.Op = OpAMD64VPMOVSQB128_512 + case OpSaturateToUint8Uint64x4: + v.Op = OpAMD64VPMOVUSQB128_256 return true - case OpSaturateToUint8Uint16x32: - v.Op = OpAMD64VPMOVUSWB256 + case OpSaturateToUint8Uint64x8: + v.Op = OpAMD64VPMOVUSQB128_512 return true case OpScaleFloat32x16: v.Op = OpAMD64VSCALEFPS512 @@ -33775,6 +33775,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked128(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU16Masked128 (VPMOVUSWB128_128 x) mask) + // result: (VPMOVUSWBMasked128_128 x mask) + for { + if v_0.Op != OpAMD64VPMOVUSWB128_128 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVUSWBMasked128_128) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU16Masked128 (VPSHLDW128 [a] x y) mask) // result: (VPSHLDWMasked128 [a] x y mask) for { @@ -34327,6 +34339,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked256(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU16Masked256 (VPMOVUSWB128_256 x) mask) + // result: (VPMOVUSWBMasked128_256 x mask) + for { + if v_0.Op != OpAMD64VPMOVUSWB128_256 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVUSWBMasked128_256) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU16Masked256 (VPMOVUSWB256 x) mask) // result: (VPMOVUSWBMasked256 x mask) for { @@ -35607,6 +35631,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked128(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU32Masked128 (VPMOVUSDB128_128 x) mask) + // result: (VPMOVUSDBMasked128_128 x mask) + for { + if v_0.Op != OpAMD64VPMOVUSDB128_128 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVUSDBMasked128_128) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU32Masked128 (VPACKUSDW128 x y) mask) // result: (VPACKUSDWMasked128 x y mask) for { @@ -36480,6 +36516,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked256(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU32Masked256 (VPMOVUSDB128_256 x) mask) + // result: (VPMOVUSDBMasked128_256 x mask) + for { + if v_0.Op != OpAMD64VPMOVUSDB128_256 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVUSDBMasked128_256) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU32Masked256 (VPACKUSDW256 x y) mask) // result: (VPACKUSDWMasked256 x y mask) for { @@ -37416,6 +37464,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool { v.AddArg3(x, y, mask) return true } + // match: (VMOVDQU32Masked512 (VPMOVUSDB128_512 x) mask) + // result: (VPMOVUSDBMasked128_512 x mask) + for { + if v_0.Op != OpAMD64VPMOVUSDB128_512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVUSDBMasked128_512) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU32Masked512 (VPACKUSDW512 x y) mask) // result: (VPACKUSDWMasked512 x y mask) for { @@ -38259,6 +38319,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked128(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU64Masked128 (VPMOVUSQB128_128 x) mask) + // result: (VPMOVUSQBMasked128_128 x mask) + for { + if v_0.Op != OpAMD64VPMOVUSQB128_128 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVUSQBMasked128_128) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU64Masked128 (VPMOVUSQW128_128 x) mask) // result: (VPMOVUSQWMasked128_128 x mask) for { @@ -39100,6 +39172,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked256(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU64Masked256 (VPMOVUSQB128_256 x) mask) + // result: (VPMOVUSQBMasked128_256 x mask) + for { + if v_0.Op != OpAMD64VPMOVUSQB128_256 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVUSQBMasked128_256) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU64Masked256 (VPMOVUSQW128_256 x) mask) // result: (VPMOVUSQWMasked128_256 x mask) for { @@ -39920,6 +40004,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked512(v *Value) bool { v.AddArg2(x, mask) return true } + // match: (VMOVDQU64Masked512 (VPMOVUSQB128_512 x) mask) + // result: (VPMOVUSQBMasked128_512 x mask) + for { + if v_0.Op != OpAMD64VPMOVUSQB128_512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPMOVUSQBMasked128_512) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU64Masked512 (VPMOVUSQW128_512 x) mask) // result: (VPMOVUSQWMasked128_512 x mask) for { @@ -44109,6 +44205,19 @@ func rewriteValueAMD64_OpAMD64VPBLENDMDMasked512(v *Value) bool { v.AddArg3(dst, x, mask) return true } + // match: (VPBLENDMDMasked512 dst (VPMOVUSDB128_512 x) mask) + // result: (VPMOVUSDBMasked128_512Merging dst x mask) + for { + dst := v_0 + if v_1.Op != OpAMD64VPMOVUSDB128_512 { + break + } + x := v_1.Args[0] + mask := v_2 + v.reset(OpAMD64VPMOVUSDBMasked128_512Merging) + v.AddArg3(dst, x, mask) + return true + } // match: (VPBLENDMDMasked512 dst (VPMOVUSDW256 x) mask) // result: (VPMOVUSDWMasked256Merging dst x mask) for { @@ -44869,6 +44978,19 @@ func rewriteValueAMD64_OpAMD64VPBLENDMQMasked512(v *Value) bool { v.AddArg3(dst, x, mask) return true } + // match: (VPBLENDMQMasked512 dst (VPMOVUSQB128_512 x) mask) + // result: (VPMOVUSQBMasked128_512Merging dst x mask) + for { + dst := v_0 + if v_1.Op != OpAMD64VPMOVUSQB128_512 { + break + } + x := v_1.Args[0] + mask := v_2 + v.reset(OpAMD64VPMOVUSQBMasked128_512Merging) + v.AddArg3(dst, x, mask) + return true + } // match: (VPBLENDMQMasked512 dst (VPMOVUSQD256 x) mask) // result: (VPMOVUSQDMasked256Merging dst x mask) for { @@ -47797,6 +47919,25 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB128(v *Value) bool { v.AddArg3(dst, x, v0) return true } + // match: (VPBLENDVB128 dst (VPMOVUSDB128_128 x) mask) + // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) + // result: (VPMOVUSDBMasked128_128Merging dst x (VPMOVVec32x4ToM mask)) + for { + dst := v_0 + if v_1.Op != OpAMD64VPMOVUSDB128_128 { + break + } + x := v_1.Args[0] + mask := v_2 + if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) { + break + } + v.reset(OpAMD64VPMOVUSDBMasked128_128Merging) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(dst, x, v0) + return true + } // match: (VPBLENDVB128 dst (VPMOVUSDW128_128 x) mask) // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) // result: (VPMOVUSDWMasked128_128Merging dst x (VPMOVVec32x4ToM mask)) @@ -47816,6 +47957,25 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB128(v *Value) bool { v.AddArg3(dst, x, v0) return true } + // match: (VPBLENDVB128 dst (VPMOVUSQB128_128 x) mask) + // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) + // result: (VPMOVUSQBMasked128_128Merging dst x (VPMOVVec64x2ToM mask)) + for { + dst := v_0 + if v_1.Op != OpAMD64VPMOVUSQB128_128 { + break + } + x := v_1.Args[0] + mask := v_2 + if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) { + break + } + v.reset(OpAMD64VPMOVUSQBMasked128_128Merging) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(dst, x, v0) + return true + } // match: (VPBLENDVB128 dst (VPMOVUSQD128_128 x) mask) // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) // result: (VPMOVUSQDMasked128_128Merging dst x (VPMOVVec64x2ToM mask)) @@ -47854,6 +48014,25 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB128(v *Value) bool { v.AddArg3(dst, x, v0) return true } + // match: (VPBLENDVB128 dst (VPMOVUSWB128_128 x) mask) + // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) + // result: (VPMOVUSWBMasked128_128Merging dst x (VPMOVVec16x8ToM mask)) + for { + dst := v_0 + if v_1.Op != OpAMD64VPMOVUSWB128_128 { + break + } + x := v_1.Args[0] + mask := v_2 + if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) { + break + } + v.reset(OpAMD64VPMOVUSWBMasked128_128Merging) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(dst, x, v0) + return true + } // match: (VPBLENDVB128 dst (VPMOVWB128_128 x) mask) // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) // result: (VPMOVWBMasked128_128Merging dst x (VPMOVVec16x8ToM mask)) @@ -50990,6 +51169,25 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB256(v *Value) bool { v.AddArg3(dst, x, v0) return true } + // match: (VPBLENDVB256 dst (VPMOVUSDB128_256 x) mask) + // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) + // result: (VPMOVUSDBMasked128_256Merging dst x (VPMOVVec32x8ToM mask)) + for { + dst := v_0 + if v_1.Op != OpAMD64VPMOVUSDB128_256 { + break + } + x := v_1.Args[0] + mask := v_2 + if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) { + break + } + v.reset(OpAMD64VPMOVUSDBMasked128_256Merging) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(dst, x, v0) + return true + } // match: (VPBLENDVB256 dst (VPMOVUSDW128_256 x) mask) // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) // result: (VPMOVUSDWMasked128_256Merging dst x (VPMOVVec32x8ToM mask)) @@ -51009,6 +51207,25 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB256(v *Value) bool { v.AddArg3(dst, x, v0) return true } + // match: (VPBLENDVB256 dst (VPMOVUSQB128_256 x) mask) + // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) + // result: (VPMOVUSQBMasked128_256Merging dst x (VPMOVVec64x4ToM mask)) + for { + dst := v_0 + if v_1.Op != OpAMD64VPMOVUSQB128_256 { + break + } + x := v_1.Args[0] + mask := v_2 + if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) { + break + } + v.reset(OpAMD64VPMOVUSQBMasked128_256Merging) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(dst, x, v0) + return true + } // match: (VPBLENDVB256 dst (VPMOVUSQD128_256 x) mask) // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) // result: (VPMOVUSQDMasked128_256Merging dst x (VPMOVVec64x4ToM mask)) @@ -51047,6 +51264,25 @@ func rewriteValueAMD64_OpAMD64VPBLENDVB256(v *Value) bool { v.AddArg3(dst, x, v0) return true } + // match: (VPBLENDVB256 dst (VPMOVUSWB128_256 x) mask) + // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) + // result: (VPMOVUSWBMasked128_256Merging dst x (VPMOVVec16x16ToM mask)) + for { + dst := v_0 + if v_1.Op != OpAMD64VPMOVUSWB128_256 { + break + } + x := v_1.Args[0] + mask := v_2 + if !(v.Block.CPUfeatures.hasFeature(CPUavx512)) { + break + } + v.reset(OpAMD64VPMOVUSWBMasked128_256Merging) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(dst, x, v0) + return true + } // match: (VPBLENDVB256 dst (VPMOVWB128_256 x) mask) // cond: v.Block.CPUfeatures.hasFeature(CPUavx512) // result: (VPMOVWBMasked128_256Merging dst x (VPMOVVec16x16ToM mask)) diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 7eb5456994..22cf50d491 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -931,15 +931,15 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Int64x2.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x4.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x8.SaturateToInt32", opLen1(ssa.OpSaturateToInt32Int64x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int16x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x4.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int32x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x16.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int32x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x2.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x4.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int64x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Int64x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint16x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x32.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint16x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x4.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint32x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x16.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint32x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x2.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x4.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint64x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x8.SaturateToUint8", opLen1(ssa.OpSaturateToUint8Uint64x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x4.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x8.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint32x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x16.SaturateToUint16", opLen1(ssa.OpSaturateToUint16Uint32x16, types.TypeVec256), sys.AMD64) diff --git a/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml b/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml index 64cd4cb24e..1d688b434d 100644 --- a/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml +++ b/src/simd/archsimd/_gen/simdgen/ops/Converts/go.yaml @@ -390,11 +390,11 @@ bits: 128 - go: SaturateToUint8 regexpTag: "convert" - asm: "VPMOVS[WDQ]B" + asm: "VPMOVUS[WDQ]B" in: - - base: int + - base: uint out: - - base: int + - base: uint bits: 128 - go: SaturateToInt8 regexpTag: "convert" diff --git a/src/simd/archsimd/ops_amd64.go b/src/simd/archsimd/ops_amd64.go index 8e32533aec..304c0c0796 100644 --- a/src/simd/archsimd/ops_amd64.go +++ b/src/simd/archsimd/ops_amd64.go @@ -5464,53 +5464,53 @@ func (x Int64x8) SaturateToInt32() Int32x8 // SaturateToUint8 converts element values to uint8 with unsigned saturation. // Results are packed to low elements in the returned vector, its upper elements are zeroed. // -// Asm: VPMOVSWB, CPU Feature: AVX512 -func (x Int16x8) SaturateToUint8() Int8x16 +// Asm: VPMOVUSWB, CPU Feature: AVX512 +func (x Uint16x8) SaturateToUint8() Uint8x16 // SaturateToUint8 converts element values to uint8 with unsigned saturation. // -// Asm: VPMOVSWB, CPU Feature: AVX512 -func (x Int16x16) SaturateToUint8() Int8x16 +// Asm: VPMOVUSWB, CPU Feature: AVX512 +func (x Uint16x16) SaturateToUint8() Uint8x16 // SaturateToUint8 converts element values to uint8 with unsigned saturation. -// Results are packed to low elements in the returned vector, its upper elements are zeroed. // -// Asm: VPMOVSDB, CPU Feature: AVX512 -func (x Int32x4) SaturateToUint8() Int8x16 +// Asm: VPMOVUSWB, CPU Feature: AVX512 +func (x Uint16x32) SaturateToUint8() Uint8x32 // SaturateToUint8 converts element values to uint8 with unsigned saturation. // Results are packed to low elements in the returned vector, its upper elements are zeroed. // -// Asm: VPMOVSDB, CPU Feature: AVX512 -func (x Int32x8) SaturateToUint8() Int8x16 +// Asm: VPMOVUSDB, CPU Feature: AVX512 +func (x Uint32x4) SaturateToUint8() Uint8x16 // SaturateToUint8 converts element values to uint8 with unsigned saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // -// Asm: VPMOVSDB, CPU Feature: AVX512 -func (x Int32x16) SaturateToUint8() Int8x16 +// Asm: VPMOVUSDB, CPU Feature: AVX512 +func (x Uint32x8) SaturateToUint8() Uint8x16 // SaturateToUint8 converts element values to uint8 with unsigned saturation. -// Results are packed to low elements in the returned vector, its upper elements are zeroed. // -// Asm: VPMOVSQB, CPU Feature: AVX512 -func (x Int64x2) SaturateToUint8() Int8x16 +// Asm: VPMOVUSDB, CPU Feature: AVX512 +func (x Uint32x16) SaturateToUint8() Uint8x16 // SaturateToUint8 converts element values to uint8 with unsigned saturation. // Results are packed to low elements in the returned vector, its upper elements are zeroed. // -// Asm: VPMOVSQB, CPU Feature: AVX512 -func (x Int64x4) SaturateToUint8() Int8x16 +// Asm: VPMOVUSQB, CPU Feature: AVX512 +func (x Uint64x2) SaturateToUint8() Uint8x16 // SaturateToUint8 converts element values to uint8 with unsigned saturation. // Results are packed to low elements in the returned vector, its upper elements are zeroed. // -// Asm: VPMOVSQB, CPU Feature: AVX512 -func (x Int64x8) SaturateToUint8() Int8x16 +// Asm: VPMOVUSQB, CPU Feature: AVX512 +func (x Uint64x4) SaturateToUint8() Uint8x16 // SaturateToUint8 converts element values to uint8 with unsigned saturation. +// Results are packed to low elements in the returned vector, its upper elements are zeroed. // -// Asm: VPMOVUSWB, CPU Feature: AVX512 -func (x Uint16x32) SaturateToUint8() Uint8x32 +// Asm: VPMOVUSQB, CPU Feature: AVX512 +func (x Uint64x8) SaturateToUint8() Uint8x16 /* SaturateToUint16 */ -- 2.52.0