From f1e3651c3324c905b71ca333987650ccf2d1d5d9 Mon Sep 17 00:00:00 2001 From: Junyang Shao Date: Tue, 9 Sep 2025 16:29:38 +0000 Subject: [PATCH] [dev.simd] cmd/compile, simd: add VPTEST Change-Id: Ia5103100eca2747fd10917ee2f32e3403e68e844 Reviewed-on: https://go-review.googlesource.com/c/go/+/702175 Reviewed-by: Cherry Mui Reviewed-by: Junyang Shao LUCI-TryBot-Result: Go LUCI Reviewed-by: Rob Lee --- src/cmd/compile/internal/amd64/ssa.go | 8 ++ src/cmd/compile/internal/ssa/_gen/AMD64.rules | 3 + src/cmd/compile/internal/ssa/_gen/AMD64Ops.go | 36 ++--- .../compile/internal/ssa/_gen/genericOps.go | 3 + src/cmd/compile/internal/ssa/opGen.go | 19 +++ src/cmd/compile/internal/ssa/rewriteAMD64.go | 16 +++ src/cmd/compile/internal/ssagen/intrinsics.go | 16 +++ src/simd/extra_amd64.go | 128 ++++++++++++++++++ src/simd/internal/simd_test/simd_test.go | 23 ++++ 9 files changed, 236 insertions(+), 16 deletions(-) diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go index 47de170ee4..5546ce8d54 100644 --- a/src/cmd/compile/internal/amd64/ssa.go +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -1845,6 +1845,14 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { p.From.Reg = v.Args[0].Reg() p.To.Type = obj.TYPE_REG p.To.Reg = v.Reg() + case ssa.OpAMD64VPTEST: + // Some instructions setting flags put their second operand into the destination reg. + // See also CMP[BWDQ]. + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = simdReg(v.Args[0]) + p.To.Type = obj.TYPE_REG + p.To.Reg = simdReg(v.Args[1]) default: if !ssaGenSIMDValue(s, v) { diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules index a508395825..3c73737dc0 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules @@ -1732,6 +1732,9 @@ (StoreMasked64 {t} ptr mask val mem) && t.Size() == 16 => (VPMASK64store128 ptr mask val mem) (StoreMasked64 {t} ptr mask val mem) && t.Size() == 32 => (VPMASK64store256 ptr mask val mem) +// Misc +(IsZeroVec x) => (SETEQ (VPTEST x x)) + // SIMD vector K-masked loads and stores (LoadMasked64 ptr mask mem) && t.Size() == 64 => (VPMASK64load512 ptr (VPMOVVec64x8ToM mask) mem) diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go index cd538adf90..027b9832ac 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go @@ -212,22 +212,23 @@ func init() { vloadk = regInfo{inputs: []regMask{gpspsb, mask, 0}, outputs: vonly} vstorek = regInfo{inputs: []regMask{gpspsb, mask, v, 0}} - v11 = regInfo{inputs: vzonly, outputs: vonly} - v21 = regInfo{inputs: []regMask{vz, vz}, outputs: vonly} - vk = regInfo{inputs: vzonly, outputs: maskonly} - kv = regInfo{inputs: maskonly, outputs: vonly} - v2k = regInfo{inputs: []regMask{vz, vz}, outputs: maskonly} - vkv = regInfo{inputs: []regMask{vz, mask}, outputs: vonly} - v2kv = regInfo{inputs: []regMask{vz, vz, mask}, outputs: vonly} - v2kk = regInfo{inputs: []regMask{vz, vz, mask}, outputs: maskonly} - v31 = regInfo{inputs: []regMask{v, vz, vz}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15 - v3kv = regInfo{inputs: []regMask{v, vz, vz, mask}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15 - vgpv = regInfo{inputs: []regMask{vz, gp}, outputs: vonly} - vgp = regInfo{inputs: vonly, outputs: gponly} - vfpv = regInfo{inputs: []regMask{vz, fp}, outputs: vonly} - vfpkv = regInfo{inputs: 
[]regMask{vz, fp, mask}, outputs: vonly} - fpv = regInfo{inputs: []regMask{fp}, outputs: vonly} - gpv = regInfo{inputs: []regMask{gp}, outputs: vonly} + v11 = regInfo{inputs: vzonly, outputs: vonly} + v21 = regInfo{inputs: []regMask{vz, vz}, outputs: vonly} + vk = regInfo{inputs: vzonly, outputs: maskonly} + kv = regInfo{inputs: maskonly, outputs: vonly} + v2k = regInfo{inputs: []regMask{vz, vz}, outputs: maskonly} + vkv = regInfo{inputs: []regMask{vz, mask}, outputs: vonly} + v2kv = regInfo{inputs: []regMask{vz, vz, mask}, outputs: vonly} + v2kk = regInfo{inputs: []regMask{vz, vz, mask}, outputs: maskonly} + v31 = regInfo{inputs: []regMask{v, vz, vz}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15 + v3kv = regInfo{inputs: []regMask{v, vz, vz, mask}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15 + vgpv = regInfo{inputs: []regMask{vz, gp}, outputs: vonly} + vgp = regInfo{inputs: vonly, outputs: gponly} + vfpv = regInfo{inputs: []regMask{vz, fp}, outputs: vonly} + vfpkv = regInfo{inputs: []regMask{vz, fp, mask}, outputs: vonly} + fpv = regInfo{inputs: []regMask{fp}, outputs: vonly} + gpv = regInfo{inputs: []regMask{gp}, outputs: vonly} + v2flags = regInfo{inputs: []regMask{vz, vz}} w11 = regInfo{inputs: wzonly, outputs: wonly} w21 = regInfo{inputs: []regMask{wz, wz}, outputs: wonly} @@ -1426,6 +1427,9 @@ func init() { {name: "KMOVDi", argLength: 1, reg: kgp, asm: "KMOVD"}, {name: "KMOVWi", argLength: 1, reg: kgp, asm: "KMOVW"}, {name: "KMOVBi", argLength: 1, reg: kgp, asm: "KMOVB"}, + + // VPTEST + {name: "VPTEST", asm: "VPTEST", argLength: 2, reg: v2flags, clobberFlags: true, typ: "Flags"}, } var AMD64blocks = []blockData{ diff --git a/src/cmd/compile/internal/ssa/_gen/genericOps.go b/src/cmd/compile/internal/ssa/_gen/genericOps.go index 26f3e758bd..188c1c4365 100644 --- a/src/cmd/compile/internal/ssa/_gen/genericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/genericOps.go @@ -731,6 +731,9 @@ var genericOps = []opData{ {name: "CvtMask64x2to8", argLength: 1}, // arg0 = mask {name: "CvtMask64x4to8", argLength: 1}, // arg0 = mask {name: "CvtMask64x8to8", argLength: 1}, // arg0 = mask + + // Returns true if arg0 is all zero. 
+ {name: "IsZeroVec", argLength: 1}, } // kind controls successors implicit exit diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 1d2dc46895..531fe991ee 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1236,6 +1236,7 @@ const ( OpAMD64KMOVDi OpAMD64KMOVWi OpAMD64KMOVBi + OpAMD64VPTEST OpAMD64VADDPD128 OpAMD64VADDPD256 OpAMD64VADDPD512 @@ -5390,6 +5391,7 @@ const ( OpCvtMask64x2to8 OpCvtMask64x4to8 OpCvtMask64x8to8 + OpIsZeroVec OpAbsInt8x16 OpAbsInt8x32 OpAbsInt8x64 @@ -19799,6 +19801,18 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPTEST", + argLen: 2, + clobberFlags: true, + asm: x86.AVPTEST, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + }, + }, { name: "VADDPD128", argLen: 2, @@ -75862,6 +75876,11 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "IsZeroVec", + argLen: 1, + generic: true, + }, { name: "AbsInt8x16", argLen: 1, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index c0f5b4086a..70b6d549fb 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -3599,6 +3599,8 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpIsNonNil(v) case OpIsSliceInBounds: return rewriteValueAMD64_OpIsSliceInBounds(v) + case OpIsZeroVec: + return rewriteValueAMD64_OpIsZeroVec(v) case OpLeadingZerosInt32x16: v.Op = OpAMD64VPLZCNTD512 return true @@ -53712,6 +53714,20 @@ func rewriteValueAMD64_OpIsSliceInBounds(v *Value) bool { return true } } +func rewriteValueAMD64_OpIsZeroVec(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + // match: (IsZeroVec x) + // result: (SETEQ (VPTEST x x)) + for { + x := v_0 + v.reset(OpAMD64SETEQ) + v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags) + v0.AddArg2(x, x) + v.AddArg(v0) + return true + } +} func rewriteValueAMD64_OpLeq16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index 4d1b762f7d..95da078bba 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -1614,6 +1614,22 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { return nil }, sys.AMD64) + addF(simdPackage, "Int8x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Int16x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Int32x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Int64x2.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Uint8x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Uint16x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Uint32x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Uint64x2.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Int8x32.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Int16x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Int32x8.IsZero", 
opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Int64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Uint8x32.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Uint16x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Uint32x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) + addF(simdPackage, "Uint64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) } } diff --git a/src/simd/extra_amd64.go b/src/simd/extra_amd64.go index 6d09f04bbb..a7832e6a57 100644 --- a/src/simd/extra_amd64.go +++ b/src/simd/extra_amd64.go @@ -15,3 +15,131 @@ package simd // // Asm: VZEROUPPER, CPU Feature: AVX func ClearAVXUpperBits() + +// IsZero returns true if all elements of x are zeros. +// +// This method compiles to VPTEST x, x. +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// +// Asm: VPTEST, CPU Feature: AVX +func (x Int8x16) IsZero() bool + +// IsZero returns true if all elements of x are zeros. +// +// This method compiles to VPTEST x, x. +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// +// Asm: VPTEST, CPU Feature: AVX +func (x Int8x32) IsZero() bool + +// IsZero returns true if all elements of x are zeros. +// +// This method compiles to VPTEST x, x. +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// +// Asm: VPTEST, CPU Feature: AVX +func (x Int16x8) IsZero() bool + +// IsZero returns true if all elements of x are zeros. +// +// This method compiles to VPTEST x, x. +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// +// Asm: VPTEST, CPU Feature: AVX +func (x Int16x16) IsZero() bool + +// IsZero returns true if all elements of x are zeros. +// +// This method compiles to VPTEST x, x. +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// +// Asm: VPTEST, CPU Feature: AVX +func (x Int32x4) IsZero() bool + +// IsZero returns true if all elements of x are zeros. +// +// This method compiles to VPTEST x, x. +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// +// Asm: VPTEST, CPU Feature: AVX +func (x Int32x8) IsZero() bool + +// IsZero returns true if all elements of x are zeros. +// +// This method compiles to VPTEST x, x. +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// +// Asm: VPTEST, CPU Feature: AVX +func (x Int64x2) IsZero() bool + +// IsZero returns true if all elements of x are zeros. +// +// This method compiles to VPTEST x, x. +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// +// Asm: VPTEST, CPU Feature: AVX +func (x Int64x4) IsZero() bool + +// IsZero returns true if all elements of x are zeros. +// +// This method compiles to VPTEST x, x. +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// +// Asm: VPTEST, CPU Feature: AVX +func (x Uint8x16) IsZero() bool + +// IsZero returns true if all elements of x are zeros. +// +// This method compiles to VPTEST x, x. +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// +// Asm: VPTEST, CPU Feature: AVX +func (x Uint8x32) IsZero() bool + +// IsZero returns true if all elements of x are zeros. +// +// This method compiles to VPTEST x, x. 
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// +// Asm: VPTEST, CPU Feature: AVX +func (x Uint16x8) IsZero() bool + +// IsZero returns true if all elements of x are zeros. +// +// This method compiles to VPTEST x, x. +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// +// Asm: VPTEST, CPU Feature: AVX +func (x Uint16x16) IsZero() bool + +// IsZero returns true if all elements of x are zeros. +// +// This method compiles to VPTEST x, x. +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// +// Asm: VPTEST, CPU Feature: AVX +func (x Uint32x4) IsZero() bool + +// IsZero returns true if all elements of x are zeros. +// +// This method compiles to VPTEST x, x. +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// +// Asm: VPTEST, CPU Feature: AVX +func (x Uint32x8) IsZero() bool + +// IsZero returns true if all elements of x are zeros. +// +// This method compiles to VPTEST x, x. +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// +// Asm: VPTEST, CPU Feature: AVX +func (x Uint64x2) IsZero() bool + +// IsZero returns true if all elements of x are zeros. +// +// This method compiles to VPTEST x, x. +// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y +// +// Asm: VPTEST, CPU Feature: AVX +func (x Uint64x4) IsZero() bool diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go index 0ebd10d147..e43bea1e12 100644 --- a/src/simd/internal/simd_test/simd_test.go +++ b/src/simd/internal/simd_test/simd_test.go @@ -557,3 +557,26 @@ func TestLeadingZeros(t *testing.T) { } } } + +func TestIsZero(t *testing.T) { + v1 := simd.LoadUint64x2Slice([]uint64{0, 1}) + v2 := simd.LoadUint64x2Slice([]uint64{0, 0}) + if v1.IsZero() { + t.Errorf("Result incorrect, want false, got true") + } + if !v2.IsZero() { + t.Errorf("Result incorrect, want true, got false") + } + if !v1.And(v2).IsZero() { + t.Errorf("Result incorrect, want true, got false") + } + if v1.AndNot(v2).IsZero() { + t.Errorf("Result incorrect, want false, got true") + } + if !v2.And(v1).IsZero() { + t.Errorf("Result incorrect, want true, got false") + } + if !v2.AndNot(v1).IsZero() { + t.Errorf("Result incorrect, want true, got false") + } +} -- 2.52.0
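
Editor's note, not part of the patch: the lowering depends on PTEST/VPTEST setting ZF exactly when the bitwise AND of its two operands is all zero bits, so the rule (IsZeroVec x) => (SETEQ (VPTEST x x)) reads ZF back as "x is entirely zero". Below is a minimal scalar sketch of that semantics for a 128-bit vector modeled as two uint64 lanes; the helper names vptestZF and isZeroVec are illustrative only and do not appear in the CL.

    package main

    import "fmt"

    // vptestZF models only the ZF output of VPTEST x, y:
    // ZF is set when (x AND y) has no bits set in any lane.
    func vptestZF(x, y [2]uint64) bool {
        return x[0]&y[0] == 0 && x[1]&y[1] == 0
    }

    // isZeroVec models the generic IsZeroVec op as lowered by this CL:
    // VPTEST x, x followed by SETEQ, i.e. "is every bit of x zero?".
    func isZeroVec(x [2]uint64) bool {
        return vptestZF(x, x)
    }

    func main() {
        fmt.Println(isZeroVec([2]uint64{0, 0})) // true
        fmt.Println(isZeroVec([2]uint64{0, 1})) // false
    }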
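
Editor's note, not part of the patch: a hedged usage sketch of the new API. It assumes a toolchain carrying the dev.simd work with the experimental simd package enabled (e.g. via GOEXPERIMENT=simd); the intersects helper is hypothetical. The doc comments promise that x.And(y).IsZero() can eventually become a single VPTEST x, y; the rule added in this CL lowers the plain IsZero call to VPTEST z, z plus SETEQ.

    package main

    import (
        "fmt"
        "simd"
    )

    // intersects reports whether two 128-bit bitmasks share any set bit.
    // With this CL, the IsZero call compiles down to VPTEST + SETEQ.
    func intersects(a, b []uint64) bool {
        x := simd.LoadUint64x2Slice(a)
        y := simd.LoadUint64x2Slice(b)
        return !x.And(y).IsZero()
    }

    func main() {
        fmt.Println(intersects([]uint64{1, 0}, []uint64{1, 2})) // true: bit 0 overlaps
        fmt.Println(intersects([]uint64{1, 0}, []uint64{2, 2})) // false: no common bits
    }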