Cypherpunks repositories - gostls13.git/commitdiff
[dev.simd] cmd/compile, simd: add VPLZCNT[DQ]
author Junyang Shao <shaojunyang@google.com>
Mon, 8 Sep 2025 19:38:56 +0000 (19:38 +0000)
committer Junyang Shao <shaojunyang@google.com>
Mon, 8 Sep 2025 20:53:52 +0000 (13:53 -0700)
Change-Id: Ifd6d8c12deac9c41722fdf2511d860a334e83438
Reviewed-on: https://go-review.googlesource.com/c/go/+/701915
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Bypass: Junyang Shao <shaojunyang@google.com>

src/cmd/compile/internal/amd64/simdssa.go
src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteAMD64.go
src/cmd/compile/internal/ssagen/simdintrinsics.go
src/simd/_gen/simdgen/ops/Others/categories.yaml [new file with mode: 0644]
src/simd/_gen/simdgen/ops/Others/go.yaml [new file with mode: 0644]
src/simd/internal/simd_test/simd_test.go
src/simd/ops_amd64.go

diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index 33f66693004b72e9df007f0e37b0523df499b863..1c289507e193d069fdd5e008f473c91f78df1f44 100644 (file)
@@ -110,6 +110,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPMOVZXBQ256,
                ssa.OpAMD64VPMOVZXWQ256,
                ssa.OpAMD64VPMOVZXBQ512,
+               ssa.OpAMD64VPLZCNTD128,
+               ssa.OpAMD64VPLZCNTD256,
+               ssa.OpAMD64VPLZCNTD512,
+               ssa.OpAMD64VPLZCNTQ128,
+               ssa.OpAMD64VPLZCNTQ256,
+               ssa.OpAMD64VPLZCNTQ512,
                ssa.OpAMD64VPOPCNTB128,
                ssa.OpAMD64VPOPCNTB256,
                ssa.OpAMD64VPOPCNTB512,
@@ -863,6 +869,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VPEXPANDQMasked128,
                ssa.OpAMD64VPEXPANDQMasked256,
                ssa.OpAMD64VPEXPANDQMasked512,
+               ssa.OpAMD64VPLZCNTDMasked128,
+               ssa.OpAMD64VPLZCNTDMasked256,
+               ssa.OpAMD64VPLZCNTDMasked512,
+               ssa.OpAMD64VPLZCNTQMasked128,
+               ssa.OpAMD64VPLZCNTQMasked256,
+               ssa.OpAMD64VPLZCNTQMasked512,
                ssa.OpAMD64VPOPCNTBMasked128,
                ssa.OpAMD64VPOPCNTBMasked256,
                ssa.OpAMD64VPOPCNTBMasked512,
@@ -1581,6 +1593,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
                ssa.OpAMD64VGF2P8MULBMasked128,
                ssa.OpAMD64VGF2P8MULBMasked256,
                ssa.OpAMD64VGF2P8MULBMasked512,
+               ssa.OpAMD64VPLZCNTDMasked128,
+               ssa.OpAMD64VPLZCNTDMasked256,
+               ssa.OpAMD64VPLZCNTDMasked512,
+               ssa.OpAMD64VPLZCNTQMasked128,
+               ssa.OpAMD64VPLZCNTQMasked256,
+               ssa.OpAMD64VPLZCNTQMasked512,
                ssa.OpAMD64VMAXPSMasked128,
                ssa.OpAMD64VMAXPSMasked256,
                ssa.OpAMD64VMAXPSMasked512,
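diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 35ef1d35b66235a431014d49437d6837a49e8d55..bfedad1e9ba8e54f5ca36535c38484293ef8648c 100644 (file)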
 (IsNanFloat64x2 x y) => (VCMPPD128 [3] x y)
 (IsNanFloat64x4 x y) => (VCMPPD256 [3] x y)
 (IsNanFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [3] x y))
+(LeadingZerosInt32x4 ...) => (VPLZCNTD128 ...)
+(LeadingZerosInt32x8 ...) => (VPLZCNTD256 ...)
+(LeadingZerosInt32x16 ...) => (VPLZCNTD512 ...)
+(LeadingZerosInt64x2 ...) => (VPLZCNTQ128 ...)
+(LeadingZerosInt64x4 ...) => (VPLZCNTQ256 ...)
+(LeadingZerosInt64x8 ...) => (VPLZCNTQ512 ...)
+(LeadingZerosUint32x4 ...) => (VPLZCNTD128 ...)
+(LeadingZerosUint32x8 ...) => (VPLZCNTD256 ...)
+(LeadingZerosUint32x16 ...) => (VPLZCNTD512 ...)
+(LeadingZerosUint64x2 ...) => (VPLZCNTQ128 ...)
+(LeadingZerosUint64x4 ...) => (VPLZCNTQ256 ...)
+(LeadingZerosUint64x8 ...) => (VPLZCNTQ512 ...)
 (LessFloat32x4 x y) => (VCMPPS128 [1] x y)
 (LessFloat32x8 x y) => (VCMPPS256 [1] x y)
 (LessFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [1] x y))
 (VMOVDQU8Masked512 (VGF2P8AFFINEINVQB512 [a] x y) mask) => (VGF2P8AFFINEINVQBMasked512 [a] x y mask)
 (VMOVDQU8Masked512 (VGF2P8AFFINEQB512 [a] x y) mask) => (VGF2P8AFFINEQBMasked512 [a] x y mask)
 (VMOVDQU8Masked512 (VGF2P8MULB512 x y) mask) => (VGF2P8MULBMasked512 x y mask)
+(VMOVDQU32Masked512 (VPLZCNTD512 x) mask) => (VPLZCNTDMasked512 x mask)
+(VMOVDQU64Masked512 (VPLZCNTQ512 x) mask) => (VPLZCNTQMasked512 x mask)
 (VMOVDQU32Masked512 (VMAXPS512 x y) mask) => (VMAXPSMasked512 x y mask)
 (VMOVDQU64Masked512 (VMAXPD512 x y) mask) => (VMAXPDMasked512 x y mask)
 (VMOVDQU8Masked512 (VPMAXSB512 x y) mask) => (VPMAXSBMasked512 x y mask)
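diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
index 1448f8776a7d5adea6be01a75de424922a8c313e..9143f25bcad58e8efe3c985c3226d798dc49f4b2 100644 (file)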
@@ -450,6 +450,18 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
                {name: "VPHSUBSW256", argLength: 2, reg: v21, asm: "VPHSUBSW", commutative: false, typ: "Vec256", resultInArg0: false},
                {name: "VPHSUBW128", argLength: 2, reg: v21, asm: "VPHSUBW", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VPHSUBW256", argLength: 2, reg: v21, asm: "VPHSUBW", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPLZCNTD128", argLength: 1, reg: w11, asm: "VPLZCNTD", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPLZCNTD256", argLength: 1, reg: w11, asm: "VPLZCNTD", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPLZCNTD512", argLength: 1, reg: w11, asm: "VPLZCNTD", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPLZCNTDMasked128", argLength: 2, reg: wkw, asm: "VPLZCNTD", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPLZCNTDMasked256", argLength: 2, reg: wkw, asm: "VPLZCNTD", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPLZCNTDMasked512", argLength: 2, reg: wkw, asm: "VPLZCNTD", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPLZCNTQ128", argLength: 1, reg: w11, asm: "VPLZCNTQ", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPLZCNTQ256", argLength: 1, reg: w11, asm: "VPLZCNTQ", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPLZCNTQ512", argLength: 1, reg: w11, asm: "VPLZCNTQ", commutative: false, typ: "Vec512", resultInArg0: false},
+               {name: "VPLZCNTQMasked128", argLength: 2, reg: wkw, asm: "VPLZCNTQ", commutative: false, typ: "Vec128", resultInArg0: false},
+               {name: "VPLZCNTQMasked256", argLength: 2, reg: wkw, asm: "VPLZCNTQ", commutative: false, typ: "Vec256", resultInArg0: false},
+               {name: "VPLZCNTQMasked512", argLength: 2, reg: wkw, asm: "VPLZCNTQ", commutative: false, typ: "Vec512", resultInArg0: false},
                {name: "VPMADDUBSW128", argLength: 2, reg: v21, asm: "VPMADDUBSW", commutative: false, typ: "Vec128", resultInArg0: false},
                {name: "VPMADDUBSW256", argLength: 2, reg: v21, asm: "VPMADDUBSW", commutative: false, typ: "Vec256", resultInArg0: false},
                {name: "VPMADDUBSW512", argLength: 2, reg: w21, asm: "VPMADDUBSW", commutative: false, typ: "Vec512", resultInArg0: false},
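diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index 11c5785f7ddb5e2b352708067d8e3d06c09ae09e..7ee4989d892b8f93c401808c4ac4555cb67cf04f 100644 (file)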
@@ -526,6 +526,18 @@ func simdGenericOps() []opData {
                {name: "IsNanFloat64x2", argLength: 2, commutative: true},
                {name: "IsNanFloat64x4", argLength: 2, commutative: true},
                {name: "IsNanFloat64x8", argLength: 2, commutative: true},
+               {name: "LeadingZerosInt32x4", argLength: 1, commutative: false},
+               {name: "LeadingZerosInt32x8", argLength: 1, commutative: false},
+               {name: "LeadingZerosInt32x16", argLength: 1, commutative: false},
+               {name: "LeadingZerosInt64x2", argLength: 1, commutative: false},
+               {name: "LeadingZerosInt64x4", argLength: 1, commutative: false},
+               {name: "LeadingZerosInt64x8", argLength: 1, commutative: false},
+               {name: "LeadingZerosUint32x4", argLength: 1, commutative: false},
+               {name: "LeadingZerosUint32x8", argLength: 1, commutative: false},
+               {name: "LeadingZerosUint32x16", argLength: 1, commutative: false},
+               {name: "LeadingZerosUint64x2", argLength: 1, commutative: false},
+               {name: "LeadingZerosUint64x4", argLength: 1, commutative: false},
+               {name: "LeadingZerosUint64x8", argLength: 1, commutative: false},
                {name: "LessEqualFloat32x4", argLength: 2, commutative: false},
                {name: "LessEqualFloat32x8", argLength: 2, commutative: false},
                {name: "LessEqualFloat32x16", argLength: 2, commutative: false},
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 9fc60598656beb3a34dcf63896258af621ae60a9..8719602036b19cb1d19638648fdc41f440be5ec3 100644 (file)
@@ -1682,6 +1682,18 @@ const (
        OpAMD64VPHSUBSW256
        OpAMD64VPHSUBW128
        OpAMD64VPHSUBW256
+       OpAMD64VPLZCNTD128
+       OpAMD64VPLZCNTD256
+       OpAMD64VPLZCNTD512
+       OpAMD64VPLZCNTDMasked128
+       OpAMD64VPLZCNTDMasked256
+       OpAMD64VPLZCNTDMasked512
+       OpAMD64VPLZCNTQ128
+       OpAMD64VPLZCNTQ256
+       OpAMD64VPLZCNTQ512
+       OpAMD64VPLZCNTQMasked128
+       OpAMD64VPLZCNTQMasked256
+       OpAMD64VPLZCNTQMasked512
        OpAMD64VPMADDUBSW128
        OpAMD64VPMADDUBSW256
        OpAMD64VPMADDUBSW512
@@ -5343,6 +5355,18 @@ const (
        OpIsNanFloat64x2
        OpIsNanFloat64x4
        OpIsNanFloat64x8
+       OpLeadingZerosInt32x4
+       OpLeadingZerosInt32x8
+       OpLeadingZerosInt32x16
+       OpLeadingZerosInt64x2
+       OpLeadingZerosInt64x4
+       OpLeadingZerosInt64x8
+       OpLeadingZerosUint32x4
+       OpLeadingZerosUint32x8
+       OpLeadingZerosUint32x16
+       OpLeadingZerosUint64x2
+       OpLeadingZerosUint64x4
+       OpLeadingZerosUint64x8
        OpLessEqualFloat32x4
        OpLessEqualFloat32x8
        OpLessEqualFloat32x16
@@ -25897,6 +25921,168 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:   "VPLZCNTD128",
+               argLen: 1,
+               asm:    x86.AVPLZCNTD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPLZCNTD256",
+               argLen: 1,
+               asm:    x86.AVPLZCNTD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPLZCNTD512",
+               argLen: 1,
+               asm:    x86.AVPLZCNTD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPLZCNTDMasked128",
+               argLen: 2,
+               asm:    x86.AVPLZCNTD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 281474976645120},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPLZCNTDMasked256",
+               argLen: 2,
+               asm:    x86.AVPLZCNTD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 281474976645120},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPLZCNTDMasked512",
+               argLen: 2,
+               asm:    x86.AVPLZCNTD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 281474976645120},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPLZCNTQ128",
+               argLen: 1,
+               asm:    x86.AVPLZCNTQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPLZCNTQ256",
+               argLen: 1,
+               asm:    x86.AVPLZCNTQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPLZCNTQ512",
+               argLen: 1,
+               asm:    x86.AVPLZCNTQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPLZCNTQMasked128",
+               argLen: 2,
+               asm:    x86.AVPLZCNTQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 281474976645120},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPLZCNTQMasked256",
+               argLen: 2,
+               asm:    x86.AVPLZCNTQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 281474976645120},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
+       {
+               name:   "VPLZCNTQMasked512",
+               argLen: 2,
+               asm:    x86.AVPLZCNTQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
+                               {0, 281474976645120},   // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+                       outputs: []outputInfo{
+                               {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+                       },
+               },
+       },
        {
                name:   "VPMADDUBSW128",
                argLen: 2,
@@ -68572,6 +68758,66 @@ var opcodeTable = [...]opInfo{
                commutative: true,
                generic:     true,
        },
+       {
+               name:    "LeadingZerosInt32x4",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "LeadingZerosInt32x8",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "LeadingZerosInt32x16",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "LeadingZerosInt64x2",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "LeadingZerosInt64x4",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "LeadingZerosInt64x8",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "LeadingZerosUint32x4",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "LeadingZerosUint32x8",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "LeadingZerosUint32x16",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "LeadingZerosUint64x2",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "LeadingZerosUint64x4",
+               argLen:  1,
+               generic: true,
+       },
+       {
+               name:    "LeadingZerosUint64x8",
+               argLen:  1,
+               generic: true,
+       },
        {
                name:    "LessEqualFloat32x4",
                argLen:  2,
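diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 236eed8629f1de6b8a6b36ce311928bedb26a767..06cafc8e6d12b7c006263b00991489664546d96d 100644 (file)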
@@ -2489,6 +2489,42 @@ func rewriteValueAMD64(v *Value) bool {
                return rewriteValueAMD64_OpIsNonNil(v)
        case OpIsSliceInBounds:
                return rewriteValueAMD64_OpIsSliceInBounds(v)
+       case OpLeadingZerosInt32x16:
+               v.Op = OpAMD64VPLZCNTD512
+               return true
+       case OpLeadingZerosInt32x4:
+               v.Op = OpAMD64VPLZCNTD128
+               return true
+       case OpLeadingZerosInt32x8:
+               v.Op = OpAMD64VPLZCNTD256
+               return true
+       case OpLeadingZerosInt64x2:
+               v.Op = OpAMD64VPLZCNTQ128
+               return true
+       case OpLeadingZerosInt64x4:
+               v.Op = OpAMD64VPLZCNTQ256
+               return true
+       case OpLeadingZerosInt64x8:
+               v.Op = OpAMD64VPLZCNTQ512
+               return true
+       case OpLeadingZerosUint32x16:
+               v.Op = OpAMD64VPLZCNTD512
+               return true
+       case OpLeadingZerosUint32x4:
+               v.Op = OpAMD64VPLZCNTD128
+               return true
+       case OpLeadingZerosUint32x8:
+               v.Op = OpAMD64VPLZCNTD256
+               return true
+       case OpLeadingZerosUint64x2:
+               v.Op = OpAMD64VPLZCNTQ128
+               return true
+       case OpLeadingZerosUint64x4:
+               v.Op = OpAMD64VPLZCNTQ256
+               return true
+       case OpLeadingZerosUint64x8:
+               v.Op = OpAMD64VPLZCNTQ512
+               return true
        case OpLeq16:
                return rewriteValueAMD64_OpLeq16(v)
        case OpLeq16U:
@@ -27364,6 +27400,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool {
                v.AddArg3(x, y, mask)
                return true
        }
+       // match: (VMOVDQU32Masked512 (VPLZCNTD512 x) mask)
+       // result: (VPLZCNTDMasked512 x mask)
+       for {
+               if v_0.Op != OpAMD64VPLZCNTD512 {
+                       break
+               }
+               x := v_0.Args[0]
+               mask := v_1
+               v.reset(OpAMD64VPLZCNTDMasked512)
+               v.AddArg2(x, mask)
+               return true
+       }
        // match: (VMOVDQU32Masked512 (VMAXPS512 x y) mask)
        // result: (VMAXPSMasked512 x y mask)
        for {
@@ -28057,6 +28105,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked512(v *Value) bool {
                v.AddArg3(x, y, mask)
                return true
        }
+       // match: (VMOVDQU64Masked512 (VPLZCNTQ512 x) mask)
+       // result: (VPLZCNTQMasked512 x mask)
+       for {
+               if v_0.Op != OpAMD64VPLZCNTQ512 {
+                       break
+               }
+               x := v_0.Args[0]
+               mask := v_1
+               v.reset(OpAMD64VPLZCNTQMasked512)
+               v.AddArg2(x, mask)
+               return true
+       }
        // match: (VMOVDQU64Masked512 (VMAXPD512 x y) mask)
        // result: (VMAXPDMasked512 x y mask)
        for {
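diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go
index d75dc440d2bb0bd3a82eaeccc9806833155adb30..4f933de008dfaaf0e9de66616b19bfc7288acb49 100644 (file)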
@@ -574,6 +574,18 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
        addF(simdPackage, "Float64x2.IsNan", opLen2(ssa.OpIsNanFloat64x2, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float64x4.IsNan", opLen2(ssa.OpIsNanFloat64x4, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float64x8.IsNan", opLen2(ssa.OpIsNanFloat64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int32x4.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int32x8.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int32x16.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Int64x2.LeadingZeros", opLen1(ssa.OpLeadingZerosInt64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Int64x4.LeadingZeros", opLen1(ssa.OpLeadingZerosInt64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Int64x8.LeadingZeros", opLen1(ssa.OpLeadingZerosInt64x8, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint32x4.LeadingZeros", opLen1(ssa.OpLeadingZerosUint32x4, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint32x8.LeadingZeros", opLen1(ssa.OpLeadingZerosUint32x8, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint32x16.LeadingZeros", opLen1(ssa.OpLeadingZerosUint32x16, types.TypeVec512), sys.AMD64)
+       addF(simdPackage, "Uint64x2.LeadingZeros", opLen1(ssa.OpLeadingZerosUint64x2, types.TypeVec128), sys.AMD64)
+       addF(simdPackage, "Uint64x4.LeadingZeros", opLen1(ssa.OpLeadingZerosUint64x4, types.TypeVec256), sys.AMD64)
+       addF(simdPackage, "Uint64x8.LeadingZeros", opLen1(ssa.OpLeadingZerosUint64x8, types.TypeVec512), sys.AMD64)
        addF(simdPackage, "Float32x4.Less", opLen2(ssa.OpLessFloat32x4, types.TypeVec128), sys.AMD64)
        addF(simdPackage, "Float32x8.Less", opLen2(ssa.OpLessFloat32x8, types.TypeVec256), sys.AMD64)
        addF(simdPackage, "Float32x16.Less", opLen2(ssa.OpLessFloat32x16, types.TypeVec512), sys.AMD64)
diff --git a/src/simd/_gen/simdgen/ops/Others/categories.yaml b/src/simd/_gen/simdgen/ops/Others/categories.yaml
new file mode 100644 (file)
index 0000000..4489f4f
--- /dev/null
@@ -0,0 +1,5 @@
+!sum
+- go: LeadingZeros
+  commutative: false
+  documentation: !string |-
+    // NAME counts the leading zeros of each element in x.
diff --git a/src/simd/_gen/simdgen/ops/Others/go.yaml b/src/simd/_gen/simdgen/ops/Others/go.yaml
new file mode 100644 (file)
index 0000000..a4fd874
--- /dev/null
@@ -0,0 +1,8 @@
+!sum
+- go: LeadingZeros
+  asm: "VPLZCNT[DQ]"
+  in:
+  - &any
+    go: $t
+  out:
+  - *any
diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go
index 1d4311d75c52fde439dd59a09027844b85bc9965..0ebd10d14734282bae9b38ca9ca98d826a546e82 100644 (file)
@@ -540,3 +540,20 @@ func TestClearAVXUpperBits(t *testing.T) {
        checkSlices[int64](t, r, []int64{11, 22, 33, 44})
        checkSlices[int64](t, s, []int64{9, 18, 27, 36})
 }
+
+func TestLeadingZeros(t *testing.T) { // TestLeadingZeros checks Uint64x2.LeadingZeros on one small non-zero element and one zero element.
+       if !simd.HasAVX512() { // Uint64x2.LeadingZeros is documented as VPLZCNTQ with CPU feature AVX512
+               t.Skip("Test requires HasAVX512, not available on this hardware")
+               return // redundant: t.Skip already stops this test
+       }
+
+       src := []uint64{0b1111, 0} // 0b1111 occupies the low 4 bits; the second element is zero
+       want := []uint64{60, 64} // 64-4 = 60 for 0b1111; a zero element is expected to report the full width, 64
+       got := make([]uint64, 2)
+       simd.LoadUint64x2Slice(src).LeadingZeros().StoreSlice(got)
+       for i := range 2 { // compare both elements; Errorf reports a mismatch without ending the loop
+               if want[i] != got[i] {
+                       t.Errorf("Result incorrect at %d: want %d, got %d", i, want[i], got[i])
+               }
+       }
+}
diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go
index 39552131bffb53875b8937e744d29f588ea21a67..c1d0e8338a01cafcb27b3eb0e9321b8b823ed289 100644 (file)
@@ -3298,6 +3298,68 @@ func (x Float64x4) IsNan(y Float64x4) Mask64x4
 // Asm: VCMPPD, CPU Feature: AVX512
 func (x Float64x8) IsNan(y Float64x8) Mask64x8
 
+/* LeadingZeros */
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTD, CPU Feature: AVX512
+func (x Int32x4) LeadingZeros() Int32x4
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTD, CPU Feature: AVX512
+func (x Int32x8) LeadingZeros() Int32x8
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTD, CPU Feature: AVX512
+func (x Int32x16) LeadingZeros() Int32x16
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTQ, CPU Feature: AVX512
+func (x Int64x2) LeadingZeros() Int64x2
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTQ, CPU Feature: AVX512
+func (x Int64x4) LeadingZeros() Int64x4
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTQ, CPU Feature: AVX512
+func (x Int64x8) LeadingZeros() Int64x8
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTD, CPU Feature: AVX512
+func (x Uint32x4) LeadingZeros() Uint32x4
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTD, CPU Feature: AVX512
+func (x Uint32x8) LeadingZeros() Uint32x8
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTD, CPU Feature: AVX512
+func (x Uint32x16) LeadingZeros() Uint32x16
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTQ, CPU Feature: AVX512
+func (x Uint64x2) LeadingZeros() Uint64x2
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTQ, CPU Feature: AVX512
+func (x Uint64x4) LeadingZeros() Uint64x4
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTQ, CPU Feature: AVX512
+func (x Uint64x8) LeadingZeros() Uint64x8
+
 /* Less */
 
 // Less compares for less than.