From e84983fa40a6e97d3e169f1f3549af889b1b1f22 Mon Sep 17 00:00:00 2001
From: Cherry Mui
Date: Fri, 2 Jan 2026 14:02:07 -0500
Subject: [PATCH] cmd/compile: optimize SIMD IsNaN.Or(IsNaN)

IsNaN's underlying instruction, VCMPPS (or VCMPPD), takes two inputs
and computes whether either of them is NaN. Optimize the Or pattern
to generate the two-operand form.

This implements the optimization mentioned in CL 733660.

Change-Id: I13943b377ee384864c913eed320763f333a03e41
Reviewed-on: https://go-review.googlesource.com/c/go/+/733680
Reviewed-by: David Chase
LUCI-TryBot-Result: Go LUCI
---
 src/cmd/compile/internal/ssa/_gen/AMD64.rules |   8 +
 src/cmd/compile/internal/ssa/rewriteAMD64.go  | 168 ++++++++++++++++++
 .../internal/simd_test/compare_test.go        |  34 ++++
 test/codegen/simd.go                          |  31 +++-
 4 files changed, 239 insertions(+), 2 deletions(-)

diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64.rules b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
index 9c54186854..9cd23c6286 100644
--- a/src/cmd/compile/internal/ssa/_gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64.rules
@@ -1824,3 +1824,11 @@
 (EQ (VPTEST x:(VPAND(D|Q)512 j k) y) yes no) && x == y && x.Uses == 2 => (EQ (VPTEST j k) yes no)
 (EQ (VPTEST x:(VPANDN(128|256) j k) y) yes no) && x == y && x.Uses == 2 => (ULT (VPTEST k j) yes no) // AndNot has swapped its operand order
 (EQ (VPTEST x:(VPANDN(D|Q)512 j k) y) yes no) && x == y && x.Uses == 2 => (ULT (VPTEST k j) yes no) // AndNot has swapped its operand order
+
+// optimize x.IsNaN().Or(y.IsNaN())
+(VPOR128 (VCMPP(S|D)128 [3] x x) (VCMPP(S|D)128 [3] y y)) => (VCMPP(S|D)128 [3] x y)
+(VPOR256 (VCMPP(S|D)256 [3] x x) (VCMPP(S|D)256 [3] y y)) => (VCMPP(S|D)256 [3] x y)
+(VPORD512 (VPMOVMToVec32x16 (VCMPPS512 [3] x x)) (VPMOVMToVec32x16 (VCMPPS512 [3] y y))) =>
+	(VPMOVMToVec32x16 (VCMPPS512 [3] x y))
+(VPORD512 (VPMOVMToVec64x8 (VCMPPD512 [3] x x)) (VPMOVMToVec64x8 (VCMPPD512 [3] y y))) =>
+	(VPMOVMToVec64x8 (VCMPPD512 [3] x y))
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 0b2bb74ce4..3eb2a6278b 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -1382,6 +1382,10 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpAMD64VPOPCNTQMasked256(v)
 	case OpAMD64VPOPCNTQMasked512:
 		return rewriteValueAMD64_OpAMD64VPOPCNTQMasked512(v)
+	case OpAMD64VPOR128:
+		return rewriteValueAMD64_OpAMD64VPOR128(v)
+	case OpAMD64VPOR256:
+		return rewriteValueAMD64_OpAMD64VPOR256(v)
 	case OpAMD64VPORD512:
 		return rewriteValueAMD64_OpAMD64VPORD512(v)
 	case OpAMD64VPORDMasked128:
@@ -56768,9 +56772,173 @@ func rewriteValueAMD64_OpAMD64VPOPCNTQMasked512(v *Value) bool {
 	}
 	return false
 }
+func rewriteValueAMD64_OpAMD64VPOR128(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	// match: (VPOR128 (VCMPPS128 [3] x x) (VCMPPS128 [3] y y))
+	// result: (VCMPPS128 [3] x y)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			if v_0.Op != OpAMD64VCMPPS128 || auxIntToUint8(v_0.AuxInt) != 3 {
+				continue
+			}
+			x := v_0.Args[1]
+			if x != v_0.Args[0] || v_1.Op != OpAMD64VCMPPS128 || auxIntToUint8(v_1.AuxInt) != 3 {
+				continue
+			}
+			y := v_1.Args[1]
+			if y != v_1.Args[0] {
+				continue
+			}
+			v.reset(OpAMD64VCMPPS128)
+			v.AuxInt = uint8ToAuxInt(3)
+			v.AddArg2(x, y)
+			return true
+		}
+		break
+	}
+	// match: (VPOR128 (VCMPPD128 [3] x x) (VCMPPD128 [3] y y))
+	// result: (VCMPPD128 [3] x y)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			if v_0.Op != OpAMD64VCMPPD128 || auxIntToUint8(v_0.AuxInt) != 3 {
+				continue
+			}
+			x := v_0.Args[1]
+			if x != v_0.Args[0] || v_1.Op != OpAMD64VCMPPD128 || auxIntToUint8(v_1.AuxInt) != 3 {
+				continue
+			}
+			y := v_1.Args[1]
+			if y != v_1.Args[0] {
+				continue
+			}
+			v.reset(OpAMD64VCMPPD128)
+			v.AuxInt = uint8ToAuxInt(3)
+			v.AddArg2(x, y)
+			return true
+		}
+		break
+	}
+	return false
+}
+func rewriteValueAMD64_OpAMD64VPOR256(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	// match: (VPOR256 (VCMPPS256 [3] x x) (VCMPPS256 [3] y y))
+	// result: (VCMPPS256 [3] x y)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			if v_0.Op != OpAMD64VCMPPS256 || auxIntToUint8(v_0.AuxInt) != 3 {
+				continue
+			}
+			x := v_0.Args[1]
+			if x != v_0.Args[0] || v_1.Op != OpAMD64VCMPPS256 || auxIntToUint8(v_1.AuxInt) != 3 {
+				continue
+			}
+			y := v_1.Args[1]
+			if y != v_1.Args[0] {
+				continue
+			}
+			v.reset(OpAMD64VCMPPS256)
+			v.AuxInt = uint8ToAuxInt(3)
+			v.AddArg2(x, y)
+			return true
+		}
+		break
+	}
+	// match: (VPOR256 (VCMPPD256 [3] x x) (VCMPPD256 [3] y y))
+	// result: (VCMPPD256 [3] x y)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			if v_0.Op != OpAMD64VCMPPD256 || auxIntToUint8(v_0.AuxInt) != 3 {
+				continue
+			}
+			x := v_0.Args[1]
+			if x != v_0.Args[0] || v_1.Op != OpAMD64VCMPPD256 || auxIntToUint8(v_1.AuxInt) != 3 {
+				continue
+			}
+			y := v_1.Args[1]
+			if y != v_1.Args[0] {
+				continue
+			}
+			v.reset(OpAMD64VCMPPD256)
+			v.AuxInt = uint8ToAuxInt(3)
+			v.AddArg2(x, y)
+			return true
+		}
+		break
+	}
+	return false
+}
 func rewriteValueAMD64_OpAMD64VPORD512(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
+	b := v.Block
+	typ := &b.Func.Config.Types
+	// match: (VPORD512 (VPMOVMToVec32x16 (VCMPPS512 [3] x x)) (VPMOVMToVec32x16 (VCMPPS512 [3] y y)))
+	// result: (VPMOVMToVec32x16 (VCMPPS512 [3] x y))
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			if v_0.Op != OpAMD64VPMOVMToVec32x16 {
+				continue
+			}
+			v_0_0 := v_0.Args[0]
+			if v_0_0.Op != OpAMD64VCMPPS512 || auxIntToUint8(v_0_0.AuxInt) != 3 {
+				continue
+			}
+			x := v_0_0.Args[1]
+			if x != v_0_0.Args[0] || v_1.Op != OpAMD64VPMOVMToVec32x16 {
+				continue
+			}
+			v_1_0 := v_1.Args[0]
+			if v_1_0.Op != OpAMD64VCMPPS512 || auxIntToUint8(v_1_0.AuxInt) != 3 {
+				continue
+			}
+			y := v_1_0.Args[1]
+			if y != v_1_0.Args[0] {
+				continue
+			}
+			v.reset(OpAMD64VPMOVMToVec32x16)
+			v0 := b.NewValue0(v.Pos, OpAMD64VCMPPS512, typ.Mask)
+			v0.AuxInt = uint8ToAuxInt(3)
+			v0.AddArg2(x, y)
+			v.AddArg(v0)
+			return true
+		}
+		break
+	}
+	// match: (VPORD512 (VPMOVMToVec64x8 (VCMPPD512 [3] x x)) (VPMOVMToVec64x8 (VCMPPD512 [3] y y)))
+	// result: (VPMOVMToVec64x8 (VCMPPD512 [3] x y))
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			if v_0.Op != OpAMD64VPMOVMToVec64x8 {
+				continue
+			}
+			v_0_0 := v_0.Args[0]
+			if v_0_0.Op != OpAMD64VCMPPD512 || auxIntToUint8(v_0_0.AuxInt) != 3 {
+				continue
+			}
+			x := v_0_0.Args[1]
+			if x != v_0_0.Args[0] || v_1.Op != OpAMD64VPMOVMToVec64x8 {
+				continue
+			}
+			v_1_0 := v_1.Args[0]
+			if v_1_0.Op != OpAMD64VCMPPD512 || auxIntToUint8(v_1_0.AuxInt) != 3 {
+				continue
+			}
+			y := v_1_0.Args[1]
+			if y != v_1_0.Args[0] {
+				continue
+			}
+			v.reset(OpAMD64VPMOVMToVec64x8)
+			v0 := b.NewValue0(v.Pos, OpAMD64VCMPPD512, typ.Mask)
+			v0.AuxInt = uint8ToAuxInt(3)
+			v0.AddArg2(x, y)
+			v.AddArg(v0)
+			return true
+		}
+		break
+	}
 	// match: (VPORD512 x l:(VMOVDQUload512 {sym} [off] ptr mem))
 	// cond: canMergeLoad(v, l) && clobber(l)
 	// result: (VPORD512load {sym} [off] x ptr mem)
diff --git a/src/simd/archsimd/internal/simd_test/compare_test.go b/src/simd/archsimd/internal/simd_test/compare_test.go
index e678676be0..ea8514ac93 100644
--- a/src/simd/archsimd/internal/simd_test/compare_test.go
+++ b/src/simd/archsimd/internal/simd_test/compare_test.go
@@ -309,4 +309,38 @@ func TestIsNaN(t *testing.T) {
 		testFloat32x16UnaryCompare(t, archsimd.Float32x16.IsNaN, isNaNSlice[float32])
 		testFloat64x8UnaryCompare(t, archsimd.Float64x8.IsNaN, isNaNSlice[float64])
 	}
+
+	// Test x.IsNaN().Or(y.IsNaN()), which is optimized to VCMPP(S|D) $3, x, y.
+	want32 := mapCompare(func(x, y float32) bool { return x != x || y != y })
+	want64 := mapCompare(func(x, y float64) bool { return x != x || y != y })
+	testFloat32x4Compare(t,
+		func(x, y archsimd.Float32x4) archsimd.Mask32x4 {
+			return x.IsNaN().Or(y.IsNaN())
+		}, want32)
+	testFloat64x2Compare(t,
+		func(x, y archsimd.Float64x2) archsimd.Mask64x2 {
+			return x.IsNaN().Or(y.IsNaN())
+		}, want64)
+
+	if archsimd.X86.AVX2() {
+		testFloat32x8Compare(t,
+			func(x, y archsimd.Float32x8) archsimd.Mask32x8 {
+				return x.IsNaN().Or(y.IsNaN())
+			}, want32)
+		testFloat64x4Compare(t,
+			func(x, y archsimd.Float64x4) archsimd.Mask64x4 {
+				return x.IsNaN().Or(y.IsNaN())
+			}, want64)
+	}
+
+	if archsimd.X86.AVX512() {
+		testFloat32x16Compare(t,
+			func(x, y archsimd.Float32x16) archsimd.Mask32x16 {
+				return x.IsNaN().Or(y.IsNaN())
+			}, want32)
+		testFloat64x8Compare(t,
+			func(x, y archsimd.Float64x8) archsimd.Mask64x8 {
+				return x.IsNaN().Or(y.IsNaN())
+			}, want64)
+	}
 }
diff --git a/test/codegen/simd.go b/test/codegen/simd.go
index 8f3a1a9f46..04e01944de 100644
--- a/test/codegen/simd.go
+++ b/test/codegen/simd.go
@@ -6,11 +6,14 @@
 
 // These tests check code generation of simd peephole optimizations.
 
-//go:build goexperiment.simd
+//go:build goexperiment.simd && amd64
 
 package codegen
 
-import "simd/archsimd"
+import (
+	"math"
+	"simd/archsimd"
+)
 
 func vptest1() bool {
 	v1 := archsimd.LoadUint64x2Slice([]uint64{0, 1})
@@ -77,3 +80,27 @@ func simdMaskedMerge() archsimd.Int16x16 {
 	mask := archsimd.Mask16x16FromBits(5)
 	return x.Add(y).Merge(x, mask) // amd64:`VPBLENDVB\s.*$`
 }
+
+var nan = math.NaN()
+var floats64s = []float64{0, 1, 2, nan, 4, nan, 6, 7, 8, 9, 10, 11, nan, 13, 14, 15}
+var sinkInt64s = make([]int64, 100)
+
+func simdIsNaN() {
+	x := archsimd.LoadFloat64x4Slice(floats64s)
+	y := archsimd.LoadFloat64x4Slice(floats64s[4:])
+	a := x.IsNaN()
+	b := y.IsNaN()
+	// amd64:"VCMPPD [$]3," -"VPOR"
+	c := a.Or(b)
+	c.ToInt64x4().StoreSlice(sinkInt64s)
+}
+
+func simdIsNaN512() {
+	x := archsimd.LoadFloat64x8Slice(floats64s)
+	y := archsimd.LoadFloat64x8Slice(floats64s[8:])
+	a := x.IsNaN()
+	b := y.IsNaN()
+	// amd64:"VCMPPD [$]3," -"VPOR"
+	c := a.Or(b)
+	c.ToInt64x8().StoreSlice(sinkInt64s)
+}
-- 
2.52.0
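
For readers who want to see the optimization outside the test suite, below is a minimal, illustrative sketch of the source pattern the new rules target. It assumes the goexperiment.simd archsimd API exactly as used by the tests in this patch; the package layout, slice contents, and the names data, out, m are made up for the example. After this CL, the Or of the two IsNaN masks should lower to a single VCMPPD $3, x, y instead of two self-compares followed by a VPOR.

	//go:build goexperiment.simd && amd64

	// Illustrative sketch only; mirrors the pattern exercised by
	// test/codegen/simd.go and the compare_test.go additions above.
	package main

	import (
		"fmt"
		"math"
		"simd/archsimd"
	)

	func main() {
		data := []float64{0, 1, math.NaN(), 3, 4, math.NaN(), 6, 7}

		x := archsimd.LoadFloat64x4Slice(data)     // lanes 0-3
		y := archsimd.LoadFloat64x4Slice(data[4:]) // lanes 4-7

		// Each IsNaN is a VCMPPD $3 (unordered) compare of a value with itself.
		// With the rewrite rules in this CL, the Or of the two masks is fused
		// into one VCMPPD $3, x, y, since the unordered predicate is already
		// true when either operand is NaN.
		m := x.IsNaN().Or(y.IsNaN())

		out := make([]int64, 4)
		m.ToInt64x4().StoreSlice(out)
		fmt.Println(out) // nonzero lanes mark positions where x or y is NaN
	}

Building with the experiment enabled and dumping the assembly (e.g. GOEXPERIMENT=simd go build -gcflags=-S) should show the fused compare for the Or line.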