From c39b2fdd1ec86f68668141a0901d5f3fc634854e Mon Sep 17 00:00:00 2001 From: Junyang Shao Date: Mon, 8 Sep 2025 19:38:56 +0000 Subject: [PATCH] [dev.simd] cmd/compile, simd: add VPLZCNT[DQ] Change-Id: Ifd6d8c12deac9c41722fdf2511d860a334e83438 Reviewed-on: https://go-review.googlesource.com/c/go/+/701915 Reviewed-by: Cherry Mui TryBot-Bypass: Junyang Shao --- src/cmd/compile/internal/amd64/simdssa.go | 18 ++ .../compile/internal/ssa/_gen/simdAMD64.rules | 14 + .../compile/internal/ssa/_gen/simdAMD64ops.go | 12 + .../internal/ssa/_gen/simdgenericOps.go | 12 + src/cmd/compile/internal/ssa/opGen.go | 246 ++++++++++++++++++ src/cmd/compile/internal/ssa/rewriteAMD64.go | 60 +++++ .../compile/internal/ssagen/simdintrinsics.go | 12 + .../_gen/simdgen/ops/Others/categories.yaml | 5 + src/simd/_gen/simdgen/ops/Others/go.yaml | 8 + src/simd/internal/simd_test/simd_test.go | 17 ++ src/simd/ops_amd64.go | 62 +++++ 11 files changed, 466 insertions(+) create mode 100644 src/simd/_gen/simdgen/ops/Others/categories.yaml create mode 100644 src/simd/_gen/simdgen/ops/Others/go.yaml diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index 33f6669300..1c289507e1 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -110,6 +110,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPMOVZXBQ256, ssa.OpAMD64VPMOVZXWQ256, ssa.OpAMD64VPMOVZXBQ512, + ssa.OpAMD64VPLZCNTD128, + ssa.OpAMD64VPLZCNTD256, + ssa.OpAMD64VPLZCNTD512, + ssa.OpAMD64VPLZCNTQ128, + ssa.OpAMD64VPLZCNTQ256, + ssa.OpAMD64VPLZCNTQ512, ssa.OpAMD64VPOPCNTB128, ssa.OpAMD64VPOPCNTB256, ssa.OpAMD64VPOPCNTB512, @@ -863,6 +869,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPEXPANDQMasked128, ssa.OpAMD64VPEXPANDQMasked256, ssa.OpAMD64VPEXPANDQMasked512, + ssa.OpAMD64VPLZCNTDMasked128, + ssa.OpAMD64VPLZCNTDMasked256, + ssa.OpAMD64VPLZCNTDMasked512, + ssa.OpAMD64VPLZCNTQMasked128, + 
ssa.OpAMD64VPLZCNTQMasked256, + ssa.OpAMD64VPLZCNTQMasked512, ssa.OpAMD64VPOPCNTBMasked128, ssa.OpAMD64VPOPCNTBMasked256, ssa.OpAMD64VPOPCNTBMasked512, @@ -1581,6 +1593,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VGF2P8MULBMasked128, ssa.OpAMD64VGF2P8MULBMasked256, ssa.OpAMD64VGF2P8MULBMasked512, + ssa.OpAMD64VPLZCNTDMasked128, + ssa.OpAMD64VPLZCNTDMasked256, + ssa.OpAMD64VPLZCNTDMasked512, + ssa.OpAMD64VPLZCNTQMasked128, + ssa.OpAMD64VPLZCNTQMasked256, + ssa.OpAMD64VPLZCNTQMasked512, ssa.OpAMD64VMAXPSMasked128, ssa.OpAMD64VMAXPSMasked256, ssa.OpAMD64VMAXPSMasked512, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 35ef1d35b6..bfedad1e9b 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -562,6 +562,18 @@ (IsNanFloat64x2 x y) => (VCMPPD128 [3] x y) (IsNanFloat64x4 x y) => (VCMPPD256 [3] x y) (IsNanFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [3] x y)) +(LeadingZerosInt32x4 ...) => (VPLZCNTD128 ...) +(LeadingZerosInt32x8 ...) => (VPLZCNTD256 ...) +(LeadingZerosInt32x16 ...) => (VPLZCNTD512 ...) +(LeadingZerosInt64x2 ...) => (VPLZCNTQ128 ...) +(LeadingZerosInt64x4 ...) => (VPLZCNTQ256 ...) +(LeadingZerosInt64x8 ...) => (VPLZCNTQ512 ...) +(LeadingZerosUint32x4 ...) => (VPLZCNTD128 ...) +(LeadingZerosUint32x8 ...) => (VPLZCNTD256 ...) +(LeadingZerosUint32x16 ...) => (VPLZCNTD512 ...) +(LeadingZerosUint64x2 ...) => (VPLZCNTQ128 ...) +(LeadingZerosUint64x4 ...) => (VPLZCNTQ256 ...) +(LeadingZerosUint64x8 ...) => (VPLZCNTQ512 ...) 
(LessFloat32x4 x y) => (VCMPPS128 [1] x y) (LessFloat32x8 x y) => (VCMPPS256 [1] x y) (LessFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [1] x y)) @@ -1334,6 +1346,8 @@ (VMOVDQU8Masked512 (VGF2P8AFFINEINVQB512 [a] x y) mask) => (VGF2P8AFFINEINVQBMasked512 [a] x y mask) (VMOVDQU8Masked512 (VGF2P8AFFINEQB512 [a] x y) mask) => (VGF2P8AFFINEQBMasked512 [a] x y mask) (VMOVDQU8Masked512 (VGF2P8MULB512 x y) mask) => (VGF2P8MULBMasked512 x y mask) +(VMOVDQU32Masked512 (VPLZCNTD512 x) mask) => (VPLZCNTDMasked512 x mask) +(VMOVDQU64Masked512 (VPLZCNTQ512 x) mask) => (VPLZCNTQMasked512 x mask) (VMOVDQU32Masked512 (VMAXPS512 x y) mask) => (VMAXPSMasked512 x y mask) (VMOVDQU64Masked512 (VMAXPD512 x y) mask) => (VMAXPDMasked512 x y mask) (VMOVDQU8Masked512 (VPMAXSB512 x y) mask) => (VPMAXSBMasked512 x y mask) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index 1448f8776a..9143f25bca 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -450,6 +450,18 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPHSUBSW256", argLength: 2, reg: v21, asm: "VPHSUBSW", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPHSUBW128", argLength: 2, reg: v21, asm: "VPHSUBW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPHSUBW256", argLength: 2, reg: v21, asm: "VPHSUBW", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPLZCNTD128", argLength: 1, reg: w11, asm: "VPLZCNTD", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPLZCNTD256", argLength: 1, reg: w11, asm: "VPLZCNTD", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPLZCNTD512", argLength: 1, reg: w11, asm: "VPLZCNTD", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPLZCNTDMasked128", argLength: 2, reg: wkw, asm: "VPLZCNTD", commutative: false, typ: 
"Vec128", resultInArg0: false}, + {name: "VPLZCNTDMasked256", argLength: 2, reg: wkw, asm: "VPLZCNTD", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPLZCNTDMasked512", argLength: 2, reg: wkw, asm: "VPLZCNTD", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPLZCNTQ128", argLength: 1, reg: w11, asm: "VPLZCNTQ", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPLZCNTQ256", argLength: 1, reg: w11, asm: "VPLZCNTQ", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPLZCNTQ512", argLength: 1, reg: w11, asm: "VPLZCNTQ", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPLZCNTQMasked128", argLength: 2, reg: wkw, asm: "VPLZCNTQ", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VPLZCNTQMasked256", argLength: 2, reg: wkw, asm: "VPLZCNTQ", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPLZCNTQMasked512", argLength: 2, reg: wkw, asm: "VPLZCNTQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPMADDUBSW128", argLength: 2, reg: v21, asm: "VPMADDUBSW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMADDUBSW256", argLength: 2, reg: v21, asm: "VPMADDUBSW", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPMADDUBSW512", argLength: 2, reg: w21, asm: "VPMADDUBSW", commutative: false, typ: "Vec512", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 11c5785f7d..7ee4989d89 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -526,6 +526,18 @@ func simdGenericOps() []opData { {name: "IsNanFloat64x2", argLength: 2, commutative: true}, {name: "IsNanFloat64x4", argLength: 2, commutative: true}, {name: "IsNanFloat64x8", argLength: 2, commutative: true}, + {name: "LeadingZerosInt32x4", argLength: 1, commutative: false}, + {name: 
"LeadingZerosInt32x8", argLength: 1, commutative: false}, + {name: "LeadingZerosInt32x16", argLength: 1, commutative: false}, + {name: "LeadingZerosInt64x2", argLength: 1, commutative: false}, + {name: "LeadingZerosInt64x4", argLength: 1, commutative: false}, + {name: "LeadingZerosInt64x8", argLength: 1, commutative: false}, + {name: "LeadingZerosUint32x4", argLength: 1, commutative: false}, + {name: "LeadingZerosUint32x8", argLength: 1, commutative: false}, + {name: "LeadingZerosUint32x16", argLength: 1, commutative: false}, + {name: "LeadingZerosUint64x2", argLength: 1, commutative: false}, + {name: "LeadingZerosUint64x4", argLength: 1, commutative: false}, + {name: "LeadingZerosUint64x8", argLength: 1, commutative: false}, {name: "LessEqualFloat32x4", argLength: 2, commutative: false}, {name: "LessEqualFloat32x8", argLength: 2, commutative: false}, {name: "LessEqualFloat32x16", argLength: 2, commutative: false}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 9fc6059865..8719602036 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1682,6 +1682,18 @@ const ( OpAMD64VPHSUBSW256 OpAMD64VPHSUBW128 OpAMD64VPHSUBW256 + OpAMD64VPLZCNTD128 + OpAMD64VPLZCNTD256 + OpAMD64VPLZCNTD512 + OpAMD64VPLZCNTDMasked128 + OpAMD64VPLZCNTDMasked256 + OpAMD64VPLZCNTDMasked512 + OpAMD64VPLZCNTQ128 + OpAMD64VPLZCNTQ256 + OpAMD64VPLZCNTQ512 + OpAMD64VPLZCNTQMasked128 + OpAMD64VPLZCNTQMasked256 + OpAMD64VPLZCNTQMasked512 OpAMD64VPMADDUBSW128 OpAMD64VPMADDUBSW256 OpAMD64VPMADDUBSW512 @@ -5343,6 +5355,18 @@ const ( OpIsNanFloat64x2 OpIsNanFloat64x4 OpIsNanFloat64x8 + OpLeadingZerosInt32x4 + OpLeadingZerosInt32x8 + OpLeadingZerosInt32x16 + OpLeadingZerosInt64x2 + OpLeadingZerosInt64x4 + OpLeadingZerosInt64x8 + OpLeadingZerosUint32x4 + OpLeadingZerosUint32x8 + OpLeadingZerosUint32x16 + OpLeadingZerosUint64x2 + OpLeadingZerosUint64x4 + OpLeadingZerosUint64x8 OpLessEqualFloat32x4 
OpLessEqualFloat32x8 OpLessEqualFloat32x16 @@ -25897,6 +25921,168 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPLZCNTD128", + argLen: 1, + asm: x86.AVPLZCNTD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPLZCNTD256", + argLen: 1, + asm: x86.AVPLZCNTD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPLZCNTD512", + argLen: 1, + asm: x86.AVPLZCNTD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPLZCNTDMasked128", + argLen: 2, + asm: x86.AVPLZCNTD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPLZCNTDMasked256", + argLen: 2, + asm: x86.AVPLZCNTD, + reg: regInfo{ + inputs: []inputInfo{ 
+ {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPLZCNTDMasked512", + argLen: 2, + asm: x86.AVPLZCNTD, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPLZCNTQ128", + argLen: 1, + asm: x86.AVPLZCNTQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPLZCNTQ256", + argLen: 1, + asm: x86.AVPLZCNTQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPLZCNTQ512", + argLen: 1, + asm: x86.AVPLZCNTQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: 
[]outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPLZCNTQMasked128", + argLen: 2, + asm: x86.AVPLZCNTQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPLZCNTQMasked256", + argLen: 2, + asm: x86.AVPLZCNTQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPLZCNTQMasked512", + argLen: 2, + asm: x86.AVPLZCNTQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPMADDUBSW128", argLen: 2, @@ -68572,6 +68758,66 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "LeadingZerosInt32x4", + argLen: 1, + generic: true, + }, + { + name: "LeadingZerosInt32x8", + argLen: 1, + generic: true, + }, + { + name: "LeadingZerosInt32x16", + argLen: 1, + generic: true, + }, + { + name: 
"LeadingZerosInt64x2", + argLen: 1, + generic: true, + }, + { + name: "LeadingZerosInt64x4", + argLen: 1, + generic: true, + }, + { + name: "LeadingZerosInt64x8", + argLen: 1, + generic: true, + }, + { + name: "LeadingZerosUint32x4", + argLen: 1, + generic: true, + }, + { + name: "LeadingZerosUint32x8", + argLen: 1, + generic: true, + }, + { + name: "LeadingZerosUint32x16", + argLen: 1, + generic: true, + }, + { + name: "LeadingZerosUint64x2", + argLen: 1, + generic: true, + }, + { + name: "LeadingZerosUint64x4", + argLen: 1, + generic: true, + }, + { + name: "LeadingZerosUint64x8", + argLen: 1, + generic: true, + }, { name: "LessEqualFloat32x4", argLen: 2, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 236eed8629..06cafc8e6d 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -2489,6 +2489,42 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpIsNonNil(v) case OpIsSliceInBounds: return rewriteValueAMD64_OpIsSliceInBounds(v) + case OpLeadingZerosInt32x16: + v.Op = OpAMD64VPLZCNTD512 + return true + case OpLeadingZerosInt32x4: + v.Op = OpAMD64VPLZCNTD128 + return true + case OpLeadingZerosInt32x8: + v.Op = OpAMD64VPLZCNTD256 + return true + case OpLeadingZerosInt64x2: + v.Op = OpAMD64VPLZCNTQ128 + return true + case OpLeadingZerosInt64x4: + v.Op = OpAMD64VPLZCNTQ256 + return true + case OpLeadingZerosInt64x8: + v.Op = OpAMD64VPLZCNTQ512 + return true + case OpLeadingZerosUint32x16: + v.Op = OpAMD64VPLZCNTD512 + return true + case OpLeadingZerosUint32x4: + v.Op = OpAMD64VPLZCNTD128 + return true + case OpLeadingZerosUint32x8: + v.Op = OpAMD64VPLZCNTD256 + return true + case OpLeadingZerosUint64x2: + v.Op = OpAMD64VPLZCNTQ128 + return true + case OpLeadingZerosUint64x4: + v.Op = OpAMD64VPLZCNTQ256 + return true + case OpLeadingZerosUint64x8: + v.Op = OpAMD64VPLZCNTQ512 + return true case OpLeq16: return 
rewriteValueAMD64_OpLeq16(v) case OpLeq16U: @@ -27364,6 +27400,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool { v.AddArg3(x, y, mask) return true } + // match: (VMOVDQU32Masked512 (VPLZCNTD512 x) mask) + // result: (VPLZCNTDMasked512 x mask) + for { + if v_0.Op != OpAMD64VPLZCNTD512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPLZCNTDMasked512) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU32Masked512 (VMAXPS512 x y) mask) // result: (VMAXPSMasked512 x y mask) for { @@ -28057,6 +28105,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked512(v *Value) bool { v.AddArg3(x, y, mask) return true } + // match: (VMOVDQU64Masked512 (VPLZCNTQ512 x) mask) + // result: (VPLZCNTQMasked512 x mask) + for { + if v_0.Op != OpAMD64VPLZCNTQ512 { + break + } + x := v_0.Args[0] + mask := v_1 + v.reset(OpAMD64VPLZCNTQMasked512) + v.AddArg2(x, mask) + return true + } // match: (VMOVDQU64Masked512 (VMAXPD512 x y) mask) // result: (VMAXPDMasked512 x y mask) for { diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index d75dc440d2..4f933de008 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -574,6 +574,18 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Float64x2.IsNan", opLen2(ssa.OpIsNanFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.IsNan", opLen2(ssa.OpIsNanFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x8.IsNan", opLen2(ssa.OpIsNanFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x2.LeadingZeros", opLen1(ssa.OpLeadingZerosInt64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x4.LeadingZeros", opLen1(ssa.OpLeadingZerosInt64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x8.LeadingZeros", opLen1(ssa.OpLeadingZerosInt64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint32x4.LeadingZeros", opLen1(ssa.OpLeadingZerosUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x8.LeadingZeros", opLen1(ssa.OpLeadingZerosUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x16.LeadingZeros", opLen1(ssa.OpLeadingZerosUint32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint64x2.LeadingZeros", opLen1(ssa.OpLeadingZerosUint64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint64x4.LeadingZeros", opLen1(ssa.OpLeadingZerosUint64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint64x8.LeadingZeros", opLen1(ssa.OpLeadingZerosUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.Less", opLen2(ssa.OpLessFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Less", opLen2(ssa.OpLessFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.Less", opLen2(ssa.OpLessFloat32x16, types.TypeVec512), sys.AMD64) diff --git a/src/simd/_gen/simdgen/ops/Others/categories.yaml b/src/simd/_gen/simdgen/ops/Others/categories.yaml 
new file mode 100644 index 0000000000..4489f4f403 --- /dev/null +++ b/src/simd/_gen/simdgen/ops/Others/categories.yaml @@ -0,0 +1,5 @@ +!sum +- go: LeadingZeros + commutative: false + documentation: !string |- + // NAME counts the leading zeros of each element in x. diff --git a/src/simd/_gen/simdgen/ops/Others/go.yaml b/src/simd/_gen/simdgen/ops/Others/go.yaml new file mode 100644 index 0000000000..a4fd87407b --- /dev/null +++ b/src/simd/_gen/simdgen/ops/Others/go.yaml @@ -0,0 +1,8 @@ +!sum +- go: LeadingZeros + asm: "VPLZCNT[DQ]" + in: + - &any + go: $t + out: + - *any diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go index 1d4311d75c..0ebd10d147 100644 --- a/src/simd/internal/simd_test/simd_test.go +++ b/src/simd/internal/simd_test/simd_test.go @@ -540,3 +540,20 @@ func TestClearAVXUpperBits(t *testing.T) { checkSlices[int64](t, r, []int64{11, 22, 33, 44}) checkSlices[int64](t, s, []int64{9, 18, 27, 36}) } + +// TestLeadingZeros checks Uint64x2.LeadingZeros against hand-computed per-element counts (0b1111 -> 60, 0 -> 64). +func TestLeadingZeros(t *testing.T) { + if !simd.HasAVX512() { + t.Skip("Test requires HasAVX512, not available on this hardware") + } + + src := []uint64{0b1111, 0} + want := []uint64{60, 64} + got := make([]uint64, len(want)) + simd.LoadUint64x2Slice(src).LeadingZeros().StoreSlice(got) + for i := range want { + if want[i] != got[i] { + t.Errorf("Result incorrect at %d: want %d, got %d", i, want[i], got[i]) + } + } +} diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index 39552131bf..c1d0e8338a 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -3298,6 +3298,68 @@ func (x Float64x4) IsNan(y Float64x4) Mask64x4 // Asm: VCMPPD, CPU Feature: AVX512 func (x Float64x8) IsNan(y Float64x8) Mask64x8 +/* LeadingZeros */ + +// LeadingZeros counts the leading zeros of each element in x. +// +// Asm: VPLZCNTD, CPU Feature: AVX512 +func (x Int32x4) LeadingZeros() Int32x4 + +// LeadingZeros counts the leading zeros of each element in x.
+// +// Asm: VPLZCNTD, CPU Feature: AVX512 +func (x Int32x8) LeadingZeros() Int32x8 + +// LeadingZeros counts the leading zeros of each element in x. +// +// Asm: VPLZCNTD, CPU Feature: AVX512 +func (x Int32x16) LeadingZeros() Int32x16 + +// LeadingZeros counts the leading zeros of each element in x. +// +// Asm: VPLZCNTQ, CPU Feature: AVX512 +func (x Int64x2) LeadingZeros() Int64x2 + +// LeadingZeros counts the leading zeros of each element in x. +// +// Asm: VPLZCNTQ, CPU Feature: AVX512 +func (x Int64x4) LeadingZeros() Int64x4 + +// LeadingZeros counts the leading zeros of each element in x. +// +// Asm: VPLZCNTQ, CPU Feature: AVX512 +func (x Int64x8) LeadingZeros() Int64x8 + +// LeadingZeros counts the leading zeros of each element in x. +// +// Asm: VPLZCNTD, CPU Feature: AVX512 +func (x Uint32x4) LeadingZeros() Uint32x4 + +// LeadingZeros counts the leading zeros of each element in x. +// +// Asm: VPLZCNTD, CPU Feature: AVX512 +func (x Uint32x8) LeadingZeros() Uint32x8 + +// LeadingZeros counts the leading zeros of each element in x. +// +// Asm: VPLZCNTD, CPU Feature: AVX512 +func (x Uint32x16) LeadingZeros() Uint32x16 + +// LeadingZeros counts the leading zeros of each element in x. +// +// Asm: VPLZCNTQ, CPU Feature: AVX512 +func (x Uint64x2) LeadingZeros() Uint64x2 + +// LeadingZeros counts the leading zeros of each element in x. +// +// Asm: VPLZCNTQ, CPU Feature: AVX512 +func (x Uint64x4) LeadingZeros() Uint64x4 + +// LeadingZeros counts the leading zeros of each element in x. +// +// Asm: VPLZCNTQ, CPU Feature: AVX512 +func (x Uint64x8) LeadingZeros() Uint64x8 + /* Less */ // Less compares for less than. -- 2.52.0