From bd8a39b67a12ec3d271305105dea3b8521aa70bf Mon Sep 17 00:00:00 2001
From: Michael Munday
Date: Sun, 12 Feb 2017 22:12:12 -0500
Subject: [PATCH] cmd/compile: emit fused multiply-{add,subtract} instructions
 on s390x
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Explicitly block fused multiply-add pattern matching when a cast is used
after the multiplication, for example:

 - (a * b) + c        // can emit fused multiply-add
 - float64(a * b) + c // cannot emit fused multiply-add

float{32,64} and complex{64,128} casts of matching types are now kept as
OCONV operations rather than being replaced with OCONVNOP operations
because they now imply a rounding operation (and therefore aren't a
no-op anymore).

Operations (for example, multiplication) on complex types may utilize
fused multiply-add and -subtract instructions internally. There is no
way to disable this behavior at the moment.

Improves the performance of the floating point implementation of
poly1305:

name         old speed     new speed     delta
64           246MB/s ± 0%  275MB/s ± 0%  +11.48%  (p=0.000 n=10+8)
1K           312MB/s ± 0%  357MB/s ± 0%  +14.41%  (p=0.000 n=10+10)
64Unaligned  246MB/s ± 0%  274MB/s ± 0%  +11.43%  (p=0.000 n=10+10)
1KUnaligned  312MB/s ± 0%  357MB/s ± 0%  +14.39%  (p=0.000 n=10+8)

Updates #17895.

Change-Id: Ia771d275bb9150d1a598f8cc773444663de5ce16
Reviewed-on: https://go-review.googlesource.com/36963
Run-TryBot: Michael Munday
TryBot-Result: Gobot Gobot
Reviewed-by: Keith Randall
---
 src/cmd/compile/internal/gc/asm_test.go       |  33 +++
 src/cmd/compile/internal/gc/const.go          |   1 +
 src/cmd/compile/internal/gc/ssa.go            |  13 +-
 src/cmd/compile/internal/gc/testdata/fp.go    | 137 ++++++++++++
 src/cmd/compile/internal/gc/typecheck.go      |   7 +
 src/cmd/compile/internal/s390x/prog.go        |  34 +--
 src/cmd/compile/internal/s390x/ssa.go         |  16 ++
 src/cmd/compile/internal/ssa/gen/386.rules    |   3 +
 src/cmd/compile/internal/ssa/gen/AMD64.rules  |   3 +
 src/cmd/compile/internal/ssa/gen/ARM.rules    |   3 +
 src/cmd/compile/internal/ssa/gen/ARM64.rules  |   3 +
 src/cmd/compile/internal/ssa/gen/MIPS.rules   |   3 +
 src/cmd/compile/internal/ssa/gen/MIPS64.rules |   3 +
 src/cmd/compile/internal/ssa/gen/PPC64.rules  |   3 +
 src/cmd/compile/internal/ssa/gen/S390X.rules  |  13 ++
 src/cmd/compile/internal/ssa/gen/S390XOps.go  |  28 ++-
 .../compile/internal/ssa/gen/generic.rules    |   2 +
 .../compile/internal/ssa/gen/genericOps.go    |   4 +
 src/cmd/compile/internal/ssa/opGen.go         | 108 ++++++++++
 src/cmd/compile/internal/ssa/rewrite386.go    |  32 +++
 src/cmd/compile/internal/ssa/rewriteAMD64.go  |  32 +++
 src/cmd/compile/internal/ssa/rewriteARM.go    |  32 +++
 src/cmd/compile/internal/ssa/rewriteARM64.go  |  32 +++
 src/cmd/compile/internal/ssa/rewriteMIPS.go   |  32 +++
 src/cmd/compile/internal/ssa/rewriteMIPS64.go |  32 +++
 src/cmd/compile/internal/ssa/rewritePPC64.go  |  32 +++
 src/cmd/compile/internal/ssa/rewriteS390X.go  | 200 ++++++++++++++++++
 .../compile/internal/ssa/rewritegeneric.go    |  40 ++++
 28 files changed, 853 insertions(+), 28 deletions(-)

diff --git a/src/cmd/compile/internal/gc/asm_test.go b/src/cmd/compile/internal/gc/asm_test.go
index 76c4f640b9..4525ba3f0c 100644
--- a/src/cmd/compile/internal/gc/asm_test.go
+++ b/src/cmd/compile/internal/gc/asm_test.go
@@ -638,6 +638,39 @@ var linuxS390XTests = []*asmTest{
 		`,
 		[]string{"\tRLL\t[$]7,"},
 	},
+	// Fused multiply-add/sub instructions.
+ { + ` + func f14(x, y, z float64) float64 { + return x * y + z + } + `, + []string{"\tFMADD\t"}, + }, + { + ` + func f15(x, y, z float64) float64 { + return x * y - z + } + `, + []string{"\tFMSUB\t"}, + }, + { + ` + func f16(x, y, z float32) float32 { + return x * y + z + } + `, + []string{"\tFMADDS\t"}, + }, + { + ` + func f17(x, y, z float32) float32 { + return x * y - z + } + `, + []string{"\tFMSUBS\t"}, + }, } var linuxARMTests = []*asmTest{ diff --git a/src/cmd/compile/internal/gc/const.go b/src/cmd/compile/internal/gc/const.go index 948d46d01f..42a10a9761 100644 --- a/src/cmd/compile/internal/gc/const.go +++ b/src/cmd/compile/internal/gc/const.go @@ -719,6 +719,7 @@ func evconst(n *Node) { case OCONV_ | CTINT_, OCONV_ | CTRUNE_, OCONV_ | CTFLT_, + OCONV_ | CTCPLX_, OCONV_ | CTSTR_, OCONV_ | CTBOOL_: nl = convlit1(nl, n.Type, true, false) diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go index 11caacb9e9..1850cbde13 100644 --- a/src/cmd/compile/internal/gc/ssa.go +++ b/src/cmd/compile/internal/gc/ssa.go @@ -1287,8 +1287,8 @@ var fpConvOpToSSA = map[twoTypes]twoOpsAndType{ // float twoTypes{TFLOAT64, TFLOAT32}: twoOpsAndType{ssa.OpCvt64Fto32F, ssa.OpCopy, TFLOAT32}, - twoTypes{TFLOAT64, TFLOAT64}: twoOpsAndType{ssa.OpCopy, ssa.OpCopy, TFLOAT64}, - twoTypes{TFLOAT32, TFLOAT32}: twoOpsAndType{ssa.OpCopy, ssa.OpCopy, TFLOAT32}, + twoTypes{TFLOAT64, TFLOAT64}: twoOpsAndType{ssa.OpRound64F, ssa.OpCopy, TFLOAT64}, + twoTypes{TFLOAT32, TFLOAT32}: twoOpsAndType{ssa.OpRound32F, ssa.OpCopy, TFLOAT32}, twoTypes{TFLOAT32, TFLOAT64}: twoOpsAndType{ssa.OpCvt32Fto64F, ssa.OpCopy, TFLOAT64}, } @@ -1704,7 +1704,14 @@ func (s *state) expr(n *Node) *ssa.Value { if ft.IsComplex() && tt.IsComplex() { var op ssa.Op if ft.Size() == tt.Size() { - op = ssa.OpCopy + switch ft.Size() { + case 8: + op = ssa.OpRound32F + case 16: + op = ssa.OpRound64F + default: + s.Fatalf("weird complex conversion %v -> %v", ft, tt) + } } else if ft.Size() == 8 && tt.Size() == 16 { op = ssa.OpCvt32Fto64F } else if ft.Size() == 16 && tt.Size() == 8 { diff --git a/src/cmd/compile/internal/gc/testdata/fp.go b/src/cmd/compile/internal/gc/testdata/fp.go index 91656bef30..18082c5634 100644 --- a/src/cmd/compile/internal/gc/testdata/fp.go +++ b/src/cmd/compile/internal/gc/testdata/fp.go @@ -232,6 +232,141 @@ func integer2floatConversions() int { return fails } +func multiplyAdd() int { + fails := 0 + { + // Test that a multiply-accumulate operation with intermediate + // rounding forced by a float32() cast produces the expected + // result. + // Test cases generated experimentally on a system (s390x) that + // supports fused multiply-add instructions. 
+ var tests = [...]struct{ x, y, z, res float32 }{ + {0.6046603, 0.9405091, 0.6645601, 1.2332485}, // fused multiply-add result: 1.2332486 + {0.67908466, 0.21855305, 0.20318687, 0.3516029}, // fused multiply-add result: 0.35160288 + {0.29311424, 0.29708257, 0.752573, 0.8396522}, // fused multiply-add result: 0.8396521 + {0.5305857, 0.2535405, 0.282081, 0.41660595}, // fused multiply-add result: 0.41660598 + {0.29711226, 0.89436173, 0.097454615, 0.36318043}, // fused multiply-add result: 0.36318046 + {0.6810783, 0.24151509, 0.31152245, 0.47601312}, // fused multiply-add result: 0.47601315 + {0.73023146, 0.18292491, 0.4283571, 0.5619346}, // fused multiply-add result: 0.56193465 + {0.89634174, 0.32208398, 0.7211478, 1.009845}, // fused multiply-add result: 1.0098451 + {0.6280982, 0.12675293, 0.2813303, 0.36094356}, // fused multiply-add result: 0.3609436 + {0.29400632, 0.75316125, 0.15096405, 0.3723982}, // fused multiply-add result: 0.37239823 + } + check := func(s string, got, expected float32) int { + if got != expected { + fmt.Printf("multiplyAdd: %s, expected %g, got %g\n", s, expected, got) + return 1 + } + return 0 + } + for _, t := range tests { + fails += check( + fmt.Sprintf("float32(%v * %v) + %v", t.x, t.y, t.z), + func(x, y, z float32) float32 { + return float32(x*y) + z + }(t.x, t.y, t.z), + t.res) + + fails += check( + fmt.Sprintf("%v += float32(%v * %v)", t.z, t.x, t.y), + func(x, y, z float32) float32 { + z += float32(x * y) + return z + }(t.x, t.y, t.z), + t.res) + } + } + { + // Test that a multiply-accumulate operation with intermediate + // rounding forced by a float64() cast produces the expected + // result. + // Test cases generated experimentally on a system (s390x) that + // supports fused multiply-add instructions. + var tests = [...]struct{ x, y, z, res float64 }{ + {0.4688898449024232, 0.28303415118044517, 0.29310185733681576, 0.42581369658590373}, // fused multiply-add result: 0.4258136965859037 + {0.7886049150193449, 0.3618054804803169, 0.8805431227416171, 1.1658647029293308}, // fused multiply-add result: 1.1658647029293305 + {0.7302314772948083, 0.18292491645390843, 0.4283570818068078, 0.5619346137829748}, // fused multiply-add result: 0.5619346137829747 + {0.6908388315056789, 0.7109071952999951, 0.5637795958152644, 1.0549018919252924}, // fused multiply-add result: 1.0549018919252926 + {0.4584424785756506, 0.6001655953233308, 0.02626515060968944, 0.3014065536855481}, // fused multiply-add result: 0.30140655368554814 + {0.539210105890946, 0.9756748149873165, 0.7507630564795985, 1.2768567767840384}, // fused multiply-add result: 1.2768567767840386 + {0.7830349733960021, 0.3932509992288867, 0.1304138461737918, 0.4383431318929343}, // fused multiply-add result: 0.43834313189293433 + {0.6841751300974551, 0.6530402051353608, 0.524499759549865, 0.9712936268572192}, // fused multiply-add result: 0.9712936268572193 + {0.3691117091643448, 0.826454125634742, 0.34768170859156955, 0.6527356034505334}, // fused multiply-add result: 0.6527356034505333 + {0.16867966833433606, 0.33136826030698385, 0.8279280961505588, 0.8838231843956668}, // fused multiply-add result: 0.8838231843956669 + } + check := func(s string, got, expected float64) int { + if got != expected { + fmt.Printf("multiplyAdd: %s, expected %g, got %g\n", s, expected, got) + return 1 + } + return 0 + } + for _, t := range tests { + fails += check( + fmt.Sprintf("float64(%v * %v) + %v", t.x, t.y, t.z), + func(x, y, z float64) float64 { + return float64(x*y) + z + }(t.x, t.y, t.z), + t.res) + + fails += check( + 
				fmt.Sprintf("%v += float64(%v * %v)", t.z, t.x, t.y),
+				func(x, y, z float64) float64 {
+					z += float64(x * y)
+					return z
+				}(t.x, t.y, t.z),
+				t.res)
+		}
+	}
+	{
+		// Test that a multiply-accumulate operation with intermediate
+		// rounding forced by a complex128() cast produces the expected
+		// result.
+		// Test cases generated experimentally on a system (s390x) that
+		// supports fused multiply-add instructions.
+		var tests = [...]struct {
+			x, y float64
+			res  complex128
+		}{
+			{0.6046602879796196, 0.9405090880450124, (2.754489951983871 + 3i)},   // fused multiply-add result: (2.7544899519838713 + 3i)
+			{0.09696951891448456, 0.30091186058528707, (0.5918204173287407 + 3i)}, // fused multiply-add result: (0.5918204173287408 + 3i)
+			{0.544155573000885, 0.27850762181610883, (1.910974340818764 + 3i)},   // fused multiply-add result: (1.9109743408187638 + 3i)
+			{0.9769168685862624, 0.07429099894984302, (3.0050416047086297 + 3i)}, // fused multiply-add result: (3.00504160470863 + 3i)
+			{0.9269868035744142, 0.9549454404167818, (3.735905851140024 + 3i)},   // fused multiply-add result: (3.7359058511400245 + 3i)
+			{0.7109071952999951, 0.5637795958152644, (2.69650118171525 + 3i)},    // fused multiply-add result: (2.6965011817152496 + 3i)
+			{0.7558235074915978, 0.40380328579570035, (2.671273808270494 + 3i)},  // fused multiply-add result: (2.6712738082704934 + 3i)
+			{0.13065111702897217, 0.9859647293402467, (1.3779180804271633 + 3i)}, // fused multiply-add result: (1.3779180804271631 + 3i)
+			{0.8963417453962161, 0.3220839705208817, (3.0111092067095298 + 3i)},  // fused multiply-add result: (3.01110920670953 + 3i)
+			{0.39998376285699544, 0.497868113342702, (1.697819401913688 + 3i)},   // fused multiply-add result: (1.6978194019136883 + 3i)
+		}
+		check := func(s string, got, expected complex128) int {
+			if got != expected {
+				fmt.Printf("multiplyAdd: %s, expected %v, got %v\n", s, expected, got)
+				return 1
+			}
+			return 0
+		}
+		for _, t := range tests {
+			fails += check(
+				fmt.Sprintf("complex128(complex(%v, 1)*3) + complex(%v, 0)", t.x, t.y),
+				func(x, y float64) complex128 {
+					return complex128(complex(x, 1)*3) + complex(y, 0)
+				}(t.x, t.y),
+				t.res)
+
+			fails += check(
+				fmt.Sprintf("z := complex(%v, 0); z += complex128(complex(%v, 1) * 3)", t.y, t.x),
+				func(x, y float64) complex128 {
+					z := complex(y, 0)
+					z += complex128(complex(x, 1) * 3)
+					return z
+				}(t.x, t.y),
+				t.res)
+		}
+	}
+	return fails
+}
+
 const (
 	aa = 0x1000000000000000
 	ab = 0x100000000000000
@@ -1658,6 +1793,8 @@ func main() {
 
 	fails += integer2floatConversions()
 
+	fails += multiplyAdd()
+
 	var zero64 float64 = 0.0
 	var one64 float64 = 1.0
 	var inf64 float64 = 1.0 / zero64
diff --git a/src/cmd/compile/internal/gc/typecheck.go b/src/cmd/compile/internal/gc/typecheck.go
index ba52b71200..1aca44bce5 100644
--- a/src/cmd/compile/internal/gc/typecheck.go
+++ b/src/cmd/compile/internal/gc/typecheck.go
@@ -1716,6 +1716,13 @@ OpSwitch:
 			*r = *n
 			n.Op = OLITERAL
 			n.SetVal(n.Left.Val())
+		} else if t.Etype == n.Type.Etype {
+			switch t.Etype {
+			case TFLOAT32, TFLOAT64, TCOMPLEX64, TCOMPLEX128:
+				// Floating point casts imply rounding and
+				// so the conversion must be kept.
+				n.Op = OCONV
+			}
 		}
 
 		// do not use stringtoarraylit.
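
Note on the typecheck.go change above: because a same-type floating-point
conversion now survives as an OCONV, a float32(...) or float64(...) cast
forces an intermediate rounding that the SSA backend must not fuse away.
A minimal illustrative sketch of the observable difference (not part of
the diff; the inputs are borrowed from the first float64 test case above):

	package main

	import "fmt"

	// fused may compile to a single FMADD on s390x: x*y + z with one rounding.
	func fused(x, y, z float64) float64 {
		return x*y + z
	}

	// forced must round x*y to float64 first; the cast blocks fusion.
	func forced(x, y, z float64) float64 {
		return float64(x*y) + z
	}

	func main() {
		x, y, z := 0.4688898449024232, 0.28303415118044517, 0.29310185733681576
		fmt.Println(fused(x, y, z) == forced(x, y, z)) // may print false on s390x
	}
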
diff --git a/src/cmd/compile/internal/s390x/prog.go b/src/cmd/compile/internal/s390x/prog.go index fbe9291a24..07c1a45cd5 100644 --- a/src/cmd/compile/internal/s390x/prog.go +++ b/src/cmd/compile/internal/s390x/prog.go @@ -72,21 +72,25 @@ var progtable = [s390x.ALAST & obj.AMask]gc.ProgInfo{ s390x.AFLOGR & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RightWrite}, // Floating point. - s390x.AFADD & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite}, - s390x.AFADDS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite}, - s390x.AFSUB & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite}, - s390x.AFSUBS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite}, - s390x.AFMUL & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite}, - s390x.AFMULS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite}, - s390x.AFDIV & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite}, - s390x.AFDIVS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite}, - s390x.AFCMPU & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RightRead}, - s390x.ACEBR & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RightRead}, - s390x.ALEDBR & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RightWrite | gc.Conv}, - s390x.ALDEBR & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RightWrite | gc.Conv}, - s390x.AFSQRT & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RightWrite}, - s390x.AFNEG & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RightWrite}, - s390x.AFNEGS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RightWrite}, + s390x.AFADD & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite}, + s390x.AFADDS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite}, + s390x.AFSUB & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite}, + s390x.AFSUBS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite}, + s390x.AFMUL & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite}, + s390x.AFMULS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite}, + s390x.AFDIV & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite}, + s390x.AFDIVS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite}, + s390x.AFCMPU & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RightRead}, + s390x.ACEBR & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RightRead}, + s390x.ALEDBR & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RightWrite | gc.Conv}, + s390x.ALDEBR & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RightWrite | gc.Conv}, + s390x.AFSQRT & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RightWrite}, + s390x.AFNEG & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RightWrite}, + s390x.AFNEGS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RightWrite}, + s390x.AFMADD & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightRead | gc.RightWrite}, + s390x.AFMADDS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightRead | gc.RightWrite}, + s390x.AFMSUB & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightRead | gc.RightWrite}, + s390x.AFMSUBS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightRead | gc.RightWrite}, // Conversions s390x.ACEFBRA & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RightWrite | gc.Conv}, diff --git a/src/cmd/compile/internal/s390x/ssa.go b/src/cmd/compile/internal/s390x/ssa.go index 
f057e7315d..b349f01295 100644 --- a/src/cmd/compile/internal/s390x/ssa.go +++ b/src/cmd/compile/internal/s390x/ssa.go @@ -193,6 +193,20 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { v.Fatalf("input[0] and output not in same register %s", v.LongString()) } opregreg(v.Op.Asm(), r, v.Args[1].Reg()) + case ssa.OpS390XFMADD, ssa.OpS390XFMADDS, + ssa.OpS390XFMSUB, ssa.OpS390XFMSUBS: + r := v.Reg() + if r != v.Args[0].Reg() { + v.Fatalf("input[0] and output not in same register %s", v.LongString()) + } + r1 := v.Args[1].Reg() + r2 := v.Args[2].Reg() + p := gc.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = r1 + p.Reg = r2 + p.To.Type = obj.TYPE_REG + p.To.Reg = r case ssa.OpS390XDIVD, ssa.OpS390XDIVW, ssa.OpS390XDIVDU, ssa.OpS390XDIVWU, ssa.OpS390XMODD, ssa.OpS390XMODW, @@ -465,6 +479,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { case ssa.OpS390XLoweredGetClosurePtr: // Closure pointer is R12 (already) gc.CheckLoweredGetClosurePtr(v) + case ssa.OpS390XLoweredRound32F, ssa.OpS390XLoweredRound64F: + // input is already rounded case ssa.OpS390XLoweredGetG: r := v.Reg() p := gc.Prog(s390x.AMOVD) diff --git a/src/cmd/compile/internal/ssa/gen/386.rules b/src/cmd/compile/internal/ssa/gen/386.rules index c3503860d8..3640857b58 100644 --- a/src/cmd/compile/internal/ssa/gen/386.rules +++ b/src/cmd/compile/internal/ssa/gen/386.rules @@ -122,6 +122,9 @@ (Cvt32Fto64F x) -> (CVTSS2SD x) (Cvt64Fto32F x) -> (CVTSD2SS x) +(Round32F x) -> x +(Round64F x) -> x + // Lowering shifts // Unsigned shifts need to return 0 if shift amount is >= width of shifted value. // result = (arg << shift) & (shift >= argbits ? 0 : 0xffffffffffffffff) diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules index a480b3e7bc..622f87ef36 100644 --- a/src/cmd/compile/internal/ssa/gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules @@ -150,6 +150,9 @@ (Cvt32Fto64F x) -> (CVTSS2SD x) (Cvt64Fto32F x) -> (CVTSD2SS x) +(Round32F x) -> x +(Round64F x) -> x + // Lowering shifts // Unsigned shifts need to return 0 if shift amount is >= width of shifted value. // result = (arg << shift) & (shift >= argbits ? 
0 : 0xffffffffffffffff) diff --git a/src/cmd/compile/internal/ssa/gen/ARM.rules b/src/cmd/compile/internal/ssa/gen/ARM.rules index 7f5bc9e510..fc17573ee3 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM.rules +++ b/src/cmd/compile/internal/ssa/gen/ARM.rules @@ -220,6 +220,9 @@ (Cvt32Fto64F x) -> (MOVFD x) (Cvt64Fto32F x) -> (MOVDF x) +(Round32F x) -> x +(Round64F x) -> x + // comparisons (Eq8 x y) -> (Equal (CMP (ZeroExt8to32 x) (ZeroExt8to32 y))) (Eq16 x y) -> (Equal (CMP (ZeroExt16to32 x) (ZeroExt16to32 y))) diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules index cd9bfc0b74..94d5902abd 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM64.rules +++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules @@ -249,6 +249,9 @@ (Cvt32Fto64F x) -> (FCVTSD x) (Cvt64Fto32F x) -> (FCVTDS x) +(Round32F x) -> x +(Round64F x) -> x + // comparisons (Eq8 x y) -> (Equal (CMPW (ZeroExt8to32 x) (ZeroExt8to32 y))) (Eq16 x y) -> (Equal (CMPW (ZeroExt16to32 x) (ZeroExt16to32 y))) diff --git a/src/cmd/compile/internal/ssa/gen/MIPS.rules b/src/cmd/compile/internal/ssa/gen/MIPS.rules index 1baa0028e0..42aef79f56 100644 --- a/src/cmd/compile/internal/ssa/gen/MIPS.rules +++ b/src/cmd/compile/internal/ssa/gen/MIPS.rules @@ -194,6 +194,9 @@ (Cvt32Fto64F x) -> (MOVFD x) (Cvt64Fto32F x) -> (MOVDF x) +(Round32F x) -> x +(Round64F x) -> x + // comparisons (Eq8 x y) -> (SGTUconst [1] (XOR (ZeroExt8to32 x) (ZeroExt8to32 y))) (Eq16 x y) -> (SGTUconst [1] (XOR (ZeroExt16to32 x) (ZeroExt16to32 y))) diff --git a/src/cmd/compile/internal/ssa/gen/MIPS64.rules b/src/cmd/compile/internal/ssa/gen/MIPS64.rules index 47487bff36..720f91d747 100644 --- a/src/cmd/compile/internal/ssa/gen/MIPS64.rules +++ b/src/cmd/compile/internal/ssa/gen/MIPS64.rules @@ -203,6 +203,9 @@ (Cvt32Fto64F x) -> (MOVFD x) (Cvt64Fto32F x) -> (MOVDF x) +(Round32F x) -> x +(Round64F x) -> x + // comparisons (Eq8 x y) -> (SGTU (MOVVconst [1]) (XOR (ZeroExt8to64 x) (ZeroExt8to64 y))) (Eq16 x y) -> (SGTU (MOVVconst [1]) (XOR (ZeroExt16to64 x) (ZeroExt16to64 y))) diff --git a/src/cmd/compile/internal/ssa/gen/PPC64.rules b/src/cmd/compile/internal/ssa/gen/PPC64.rules index 56605dc1a0..f44bb34f9e 100644 --- a/src/cmd/compile/internal/ssa/gen/PPC64.rules +++ b/src/cmd/compile/internal/ssa/gen/PPC64.rules @@ -74,6 +74,9 @@ (Cvt32Fto64F x) -> x // Note x will have the wrong type for patterns dependent on Float32/Float64 (Cvt64Fto32F x) -> (FRSP x) +(Round32F x) -> x +(Round64F x) -> x + (Sqrt x) -> (FSQRT x) // Lowering constants diff --git a/src/cmd/compile/internal/ssa/gen/S390X.rules b/src/cmd/compile/internal/ssa/gen/S390X.rules index 0b5aa8e496..82468ec2c3 100644 --- a/src/cmd/compile/internal/ssa/gen/S390X.rules +++ b/src/cmd/compile/internal/ssa/gen/S390X.rules @@ -178,6 +178,9 @@ (Cvt32Fto64F x) -> (LDEBR x) (Cvt64Fto32F x) -> (LEDBR x) +(Round32F x) -> (LoweredRound32F x) +(Round64F x) -> (LoweredRound64F x) + // Lowering shifts // Unsigned shifts need to return 0 if shift amount is >= width of shifted value. // result = (arg << shift) & (shift >= argbits ? 
0 : 0xffffffffffffffff) @@ -1010,6 +1013,8 @@ (XOR (MOVDconst [c]) (MOVDconst [d])) -> (MOVDconst [c^d]) (XORconst [c] (MOVDconst [d])) -> (MOVDconst [c^d]) (XORWconst [c] (MOVDconst [d])) -> (MOVDconst [c^d]) +(LoweredRound32F x:(FMOVSconst)) -> x +(LoweredRound64F x:(FMOVDconst)) -> x // generic simplifications // TODO: more of this @@ -1024,6 +1029,14 @@ (XOR x x) -> (MOVDconst [0]) (XORW x x) -> (MOVDconst [0]) +// fused multiply-add +(FADD x (FMUL y z)) -> (FMADD x y z) +(FADDS x (FMULS y z)) -> (FMADDS x y z) +(FADD (FMUL y z) x) -> (FMADD x y z) +(FADDS (FMULS y z) x) -> (FMADDS x y z) +(FSUB (FMUL y z) x) -> (FMSUB x y z) +(FSUBS (FMULS y z) x) -> (FMSUBS x y z) + // Fold memory operations into operations. // Exclude global data (SB) because these instructions cannot handle relative addresses. // TODO(mundaym): use LARL in the assembler to handle SB? diff --git a/src/cmd/compile/internal/ssa/gen/S390XOps.go b/src/cmd/compile/internal/ssa/gen/S390XOps.go index 40ba252941..11f6656197 100644 --- a/src/cmd/compile/internal/ssa/gen/S390XOps.go +++ b/src/cmd/compile/internal/ssa/gen/S390XOps.go @@ -141,6 +141,7 @@ func init() { fp01 = regInfo{inputs: []regMask{}, outputs: fponly} fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: fponly} + fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: fponly} fp21clobber = regInfo{inputs: []regMask{fp, fp}, outputs: fponly} fpgp = regInfo{inputs: fponly, outputs: gponly} gpfp = regInfo{inputs: gponly, outputs: fponly} @@ -166,16 +167,20 @@ func init() { var S390Xops = []opData{ // fp ops - {name: "FADDS", argLength: 2, reg: fp21clobber, asm: "FADDS", commutative: true, resultInArg0: true, clobberFlags: true}, // fp32 add - {name: "FADD", argLength: 2, reg: fp21clobber, asm: "FADD", commutative: true, resultInArg0: true, clobberFlags: true}, // fp64 add - {name: "FSUBS", argLength: 2, reg: fp21clobber, asm: "FSUBS", resultInArg0: true, clobberFlags: true}, // fp32 sub - {name: "FSUB", argLength: 2, reg: fp21clobber, asm: "FSUB", resultInArg0: true, clobberFlags: true}, // fp64 sub - {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true, resultInArg0: true}, // fp32 mul - {name: "FMUL", argLength: 2, reg: fp21, asm: "FMUL", commutative: true, resultInArg0: true}, // fp64 mul - {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS", resultInArg0: true}, // fp32 div - {name: "FDIV", argLength: 2, reg: fp21, asm: "FDIV", resultInArg0: true}, // fp64 div - {name: "FNEGS", argLength: 1, reg: fp11clobber, asm: "FNEGS", clobberFlags: true}, // fp32 neg - {name: "FNEG", argLength: 1, reg: fp11clobber, asm: "FNEG", clobberFlags: true}, // fp64 neg + {name: "FADDS", argLength: 2, reg: fp21clobber, asm: "FADDS", commutative: true, resultInArg0: true, clobberFlags: true}, // fp32 arg0 + arg1 + {name: "FADD", argLength: 2, reg: fp21clobber, asm: "FADD", commutative: true, resultInArg0: true, clobberFlags: true}, // fp64 arg0 + arg1 + {name: "FSUBS", argLength: 2, reg: fp21clobber, asm: "FSUBS", resultInArg0: true, clobberFlags: true}, // fp32 arg0 - arg1 + {name: "FSUB", argLength: 2, reg: fp21clobber, asm: "FSUB", resultInArg0: true, clobberFlags: true}, // fp64 arg0 - arg1 + {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true, resultInArg0: true}, // fp32 arg0 * arg1 + {name: "FMUL", argLength: 2, reg: fp21, asm: "FMUL", commutative: true, resultInArg0: true}, // fp64 arg0 * arg1 + {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS", resultInArg0: true}, // fp32 arg0 / arg1 + {name: "FDIV", argLength: 2, reg: fp21, 
asm: "FDIV", resultInArg0: true}, // fp64 arg0 / arg1 + {name: "FNEGS", argLength: 1, reg: fp11clobber, asm: "FNEGS", clobberFlags: true}, // fp32 -arg0 + {name: "FNEG", argLength: 1, reg: fp11clobber, asm: "FNEG", clobberFlags: true}, // fp64 -arg0 + {name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS", resultInArg0: true}, // fp32 arg1 * arg2 + arg0 + {name: "FMADD", argLength: 3, reg: fp31, asm: "FMADD", resultInArg0: true}, // fp64 arg1 * arg2 + arg0 + {name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS", resultInArg0: true}, // fp32 arg1 * arg2 - arg0 + {name: "FMSUB", argLength: 3, reg: fp31, asm: "FMSUB", resultInArg0: true}, // fp64 arg1 * arg2 - arg0 {name: "FMOVSload", argLength: 2, reg: fpload, asm: "FMOVS", aux: "SymOff", faultOnNilArg0: true}, // fp32 load {name: "FMOVDload", argLength: 2, reg: fpload, asm: "FMOVD", aux: "SymOff", faultOnNilArg0: true}, // fp64 load @@ -402,6 +407,9 @@ func init() { {name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R12")}}}, // arg0=ptr,arg1=mem, returns void. Faults if ptr is nil. {name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{ptrsp}}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true}, + // Round ops to block fused-multiply-add extraction. + {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true}, + {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true}, // MOVDconvert converts between pointers and integers. // We have a special op for this so as to not confuse GC diff --git a/src/cmd/compile/internal/ssa/gen/generic.rules b/src/cmd/compile/internal/ssa/gen/generic.rules index e0a12dcae5..f485f43875 100644 --- a/src/cmd/compile/internal/ssa/gen/generic.rules +++ b/src/cmd/compile/internal/ssa/gen/generic.rules @@ -46,6 +46,8 @@ (Trunc64to32 (Const64 [c])) -> (Const32 [int64(int32(c))]) (Cvt64Fto32F (Const64F [c])) -> (Const32F [f2i(float64(i2f32(c)))]) (Cvt32Fto64F (Const32F [c])) -> (Const64F [c]) // c is already a 64 bit float +(Round32F x:(Const32F)) -> x +(Round64F x:(Const64F)) -> x (Trunc16to8 (ZeroExt8to16 x)) -> x (Trunc32to8 (ZeroExt8to32 x)) -> x diff --git a/src/cmd/compile/internal/ssa/gen/genericOps.go b/src/cmd/compile/internal/ssa/gen/genericOps.go index 3854a3954d..ab8b0ab765 100644 --- a/src/cmd/compile/internal/ssa/gen/genericOps.go +++ b/src/cmd/compile/internal/ssa/gen/genericOps.go @@ -343,6 +343,10 @@ var genericOps = []opData{ {name: "Cvt32Fto64F", argLength: 1}, {name: "Cvt64Fto32F", argLength: 1}, + // Force rounding to precision of type. + {name: "Round32F", argLength: 1}, + {name: "Round64F", argLength: 1}, + // Automatically inserted safety checks {name: "IsNonNil", argLength: 1, typ: "Bool"}, // arg0 != nil {name: "IsInBounds", argLength: 2, typ: "Bool"}, // 0 <= arg0 < arg1. arg1 is guaranteed >= 0. 
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 7a962165c5..9e10376128 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1390,6 +1390,10 @@ const ( OpS390XFDIV OpS390XFNEGS OpS390XFNEG + OpS390XFMADDS + OpS390XFMADD + OpS390XFMSUBS + OpS390XFMSUB OpS390XFMOVSload OpS390XFMOVDload OpS390XFMOVSconst @@ -1554,6 +1558,8 @@ const ( OpS390XLoweredGetG OpS390XLoweredGetClosurePtr OpS390XLoweredNilCheck + OpS390XLoweredRound32F + OpS390XLoweredRound64F OpS390XMOVDconvert OpS390XFlagEQ OpS390XFlagLT @@ -1830,6 +1836,8 @@ const ( OpCvt64Fto64 OpCvt32Fto64F OpCvt64Fto32F + OpRound32F + OpRound64F OpIsNonNil OpIsInBounds OpIsSliceInBounds @@ -17446,6 +17454,70 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "FMADDS", + argLen: 3, + resultInArg0: true, + asm: s390x.AFMADDS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + {1, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + {2, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + outputs: []outputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + }, + }, + { + name: "FMADD", + argLen: 3, + resultInArg0: true, + asm: s390x.AFMADD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + {1, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + {2, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + outputs: []outputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + }, + }, + { + name: "FMSUBS", + argLen: 3, + resultInArg0: true, + asm: s390x.AFMSUBS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + {1, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + {2, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + outputs: []outputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + }, + }, + { + name: "FMSUB", + argLen: 3, + resultInArg0: true, + asm: s390x.AFMSUB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + {1, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + {2, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + outputs: []outputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + }, + }, { name: "FMOVSload", auxType: auxSymOff, @@ -19801,6 +19873,32 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "LoweredRound32F", + argLen: 1, + resultInArg0: true, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + outputs: []outputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + }, + }, + { + name: "LoweredRound64F", + argLen: 1, + resultInArg0: true, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + outputs: []outputInfo{ + {0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 + }, + }, + }, { name: "MOVDconvert", argLen: 2, @@ -21457,6 +21555,16 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "Round32F", + argLen: 1, + generic: true, + }, + { 
+ name: "Round64F", + argLen: 1, + generic: true, + }, { name: "IsNonNil", argLen: 1, diff --git a/src/cmd/compile/internal/ssa/rewrite386.go b/src/cmd/compile/internal/ssa/rewrite386.go index a396ec1976..417f59fc0a 100644 --- a/src/cmd/compile/internal/ssa/rewrite386.go +++ b/src/cmd/compile/internal/ssa/rewrite386.go @@ -488,6 +488,10 @@ func rewriteValue386(v *Value, config *Config) bool { return rewriteValue386_OpOr8(v, config) case OpOrB: return rewriteValue386_OpOrB(v, config) + case OpRound32F: + return rewriteValue386_OpRound32F(v, config) + case OpRound64F: + return rewriteValue386_OpRound64F(v, config) case OpRsh16Ux16: return rewriteValue386_OpRsh16Ux16(v, config) case OpRsh16Ux32: @@ -12209,6 +12213,34 @@ func rewriteValue386_OpOrB(v *Value, config *Config) bool { return true } } +func rewriteValue386_OpRound32F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round32F x) + // cond: + // result: x + for { + x := v.Args[0] + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } +} +func rewriteValue386_OpRound64F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round64F x) + // cond: + // result: x + for { + x := v.Args[0] + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } +} func rewriteValue386_OpRsh16Ux16(v *Value, config *Config) bool { b := v.Block _ = b diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 2668a16867..00a554d01d 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -678,6 +678,10 @@ func rewriteValueAMD64(v *Value, config *Config) bool { return rewriteValueAMD64_OpOr8(v, config) case OpOrB: return rewriteValueAMD64_OpOrB(v, config) + case OpRound32F: + return rewriteValueAMD64_OpRound32F(v, config) + case OpRound64F: + return rewriteValueAMD64_OpRound64F(v, config) case OpRsh16Ux16: return rewriteValueAMD64_OpRsh16Ux16(v, config) case OpRsh16Ux32: @@ -20498,6 +20502,34 @@ func rewriteValueAMD64_OpOrB(v *Value, config *Config) bool { return true } } +func rewriteValueAMD64_OpRound32F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round32F x) + // cond: + // result: x + for { + x := v.Args[0] + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpRound64F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round64F x) + // cond: + // result: x + for { + x := v.Args[0] + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } +} func rewriteValueAMD64_OpRsh16Ux16(v *Value, config *Config) bool { b := v.Block _ = b diff --git a/src/cmd/compile/internal/ssa/rewriteARM.go b/src/cmd/compile/internal/ssa/rewriteARM.go index 2ad662f8fe..a5b766ebbb 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM.go +++ b/src/cmd/compile/internal/ssa/rewriteARM.go @@ -620,6 +620,10 @@ func rewriteValueARM(v *Value, config *Config) bool { return rewriteValueARM_OpOr8(v, config) case OpOrB: return rewriteValueARM_OpOrB(v, config) + case OpRound32F: + return rewriteValueARM_OpRound32F(v, config) + case OpRound64F: + return rewriteValueARM_OpRound64F(v, config) case OpRsh16Ux16: return rewriteValueARM_OpRsh16Ux16(v, config) case OpRsh16Ux32: @@ -15772,6 +15776,34 @@ func rewriteValueARM_OpOrB(v *Value, config *Config) bool { return true } } +func rewriteValueARM_OpRound32F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round32F x) + // cond: + // result: x + for { + x := v.Args[0] + 
v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } +} +func rewriteValueARM_OpRound64F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round64F x) + // cond: + // result: x + for { + x := v.Args[0] + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } +} func rewriteValueARM_OpRsh16Ux16(v *Value, config *Config) bool { b := v.Block _ = b diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go index 0e60aaad85..93472934ae 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM64.go +++ b/src/cmd/compile/internal/ssa/rewriteARM64.go @@ -576,6 +576,10 @@ func rewriteValueARM64(v *Value, config *Config) bool { return rewriteValueARM64_OpOr8(v, config) case OpOrB: return rewriteValueARM64_OpOrB(v, config) + case OpRound32F: + return rewriteValueARM64_OpRound32F(v, config) + case OpRound64F: + return rewriteValueARM64_OpRound64F(v, config) case OpRsh16Ux16: return rewriteValueARM64_OpRsh16Ux16(v, config) case OpRsh16Ux32: @@ -13172,6 +13176,34 @@ func rewriteValueARM64_OpOrB(v *Value, config *Config) bool { return true } } +func rewriteValueARM64_OpRound32F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round32F x) + // cond: + // result: x + for { + x := v.Args[0] + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } +} +func rewriteValueARM64_OpRound64F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round64F x) + // cond: + // result: x + for { + x := v.Args[0] + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } +} func rewriteValueARM64_OpRsh16Ux16(v *Value, config *Config) bool { b := v.Block _ = b diff --git a/src/cmd/compile/internal/ssa/rewriteMIPS.go b/src/cmd/compile/internal/ssa/rewriteMIPS.go index 2c320a9216..5c45d1c481 100644 --- a/src/cmd/compile/internal/ssa/rewriteMIPS.go +++ b/src/cmd/compile/internal/ssa/rewriteMIPS.go @@ -400,6 +400,10 @@ func rewriteValueMIPS(v *Value, config *Config) bool { return rewriteValueMIPS_OpOr8(v, config) case OpOrB: return rewriteValueMIPS_OpOrB(v, config) + case OpRound32F: + return rewriteValueMIPS_OpRound32F(v, config) + case OpRound64F: + return rewriteValueMIPS_OpRound64F(v, config) case OpRsh16Ux16: return rewriteValueMIPS_OpRsh16Ux16(v, config) case OpRsh16Ux32: @@ -7063,6 +7067,34 @@ func rewriteValueMIPS_OpOrB(v *Value, config *Config) bool { return true } } +func rewriteValueMIPS_OpRound32F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round32F x) + // cond: + // result: x + for { + x := v.Args[0] + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } +} +func rewriteValueMIPS_OpRound64F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round64F x) + // cond: + // result: x + for { + x := v.Args[0] + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } +} func rewriteValueMIPS_OpRsh16Ux16(v *Value, config *Config) bool { b := v.Block _ = b diff --git a/src/cmd/compile/internal/ssa/rewriteMIPS64.go b/src/cmd/compile/internal/ssa/rewriteMIPS64.go index f3d0fe3aa6..ef07ab0268 100644 --- a/src/cmd/compile/internal/ssa/rewriteMIPS64.go +++ b/src/cmd/compile/internal/ssa/rewriteMIPS64.go @@ -436,6 +436,10 @@ func rewriteValueMIPS64(v *Value, config *Config) bool { return rewriteValueMIPS64_OpOr8(v, config) case OpOrB: return rewriteValueMIPS64_OpOrB(v, config) + case OpRound32F: + return rewriteValueMIPS64_OpRound32F(v, config) + case OpRound64F: + return rewriteValueMIPS64_OpRound64F(v, config) case 
OpRsh16Ux16: return rewriteValueMIPS64_OpRsh16Ux16(v, config) case OpRsh16Ux32: @@ -7377,6 +7381,34 @@ func rewriteValueMIPS64_OpOrB(v *Value, config *Config) bool { return true } } +func rewriteValueMIPS64_OpRound32F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round32F x) + // cond: + // result: x + for { + x := v.Args[0] + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } +} +func rewriteValueMIPS64_OpRound64F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round64F x) + // cond: + // result: x + for { + x := v.Args[0] + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } +} func rewriteValueMIPS64_OpRsh16Ux16(v *Value, config *Config) bool { b := v.Block _ = b diff --git a/src/cmd/compile/internal/ssa/rewritePPC64.go b/src/cmd/compile/internal/ssa/rewritePPC64.go index 2a8bc65d1b..bb6678b590 100644 --- a/src/cmd/compile/internal/ssa/rewritePPC64.go +++ b/src/cmd/compile/internal/ssa/rewritePPC64.go @@ -436,6 +436,10 @@ func rewriteValuePPC64(v *Value, config *Config) bool { return rewriteValuePPC64_OpPPC64XOR(v, config) case OpPPC64XORconst: return rewriteValuePPC64_OpPPC64XORconst(v, config) + case OpRound32F: + return rewriteValuePPC64_OpRound32F(v, config) + case OpRound64F: + return rewriteValuePPC64_OpRound64F(v, config) case OpRsh16Ux16: return rewriteValuePPC64_OpRsh16Ux16(v, config) case OpRsh16Ux32: @@ -7424,6 +7428,34 @@ func rewriteValuePPC64_OpPPC64XORconst(v *Value, config *Config) bool { } return false } +func rewriteValuePPC64_OpRound32F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round32F x) + // cond: + // result: x + for { + x := v.Args[0] + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } +} +func rewriteValuePPC64_OpRound64F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round64F x) + // cond: + // result: x + for { + x := v.Args[0] + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } +} func rewriteValuePPC64_OpRsh16Ux16(v *Value, config *Config) bool { b := v.Block _ = b diff --git a/src/cmd/compile/internal/ssa/rewriteS390X.go b/src/cmd/compile/internal/ssa/rewriteS390X.go index 358c28e529..29ea57c948 100644 --- a/src/cmd/compile/internal/ssa/rewriteS390X.go +++ b/src/cmd/compile/internal/ssa/rewriteS390X.go @@ -374,6 +374,10 @@ func rewriteValueS390X(v *Value, config *Config) bool { return rewriteValueS390X_OpOr8(v, config) case OpOrB: return rewriteValueS390X_OpOrB(v, config) + case OpRound32F: + return rewriteValueS390X_OpRound32F(v, config) + case OpRound64F: + return rewriteValueS390X_OpRound64F(v, config) case OpRsh16Ux16: return rewriteValueS390X_OpRsh16Ux16(v, config) case OpRsh16Ux32: @@ -470,6 +474,10 @@ func rewriteValueS390X(v *Value, config *Config) bool { return rewriteValueS390X_OpS390XCMPWconst(v, config) case OpS390XCMPconst: return rewriteValueS390X_OpS390XCMPconst(v, config) + case OpS390XFADD: + return rewriteValueS390X_OpS390XFADD(v, config) + case OpS390XFADDS: + return rewriteValueS390X_OpS390XFADDS(v, config) case OpS390XFMOVDload: return rewriteValueS390X_OpS390XFMOVDload(v, config) case OpS390XFMOVDloadidx: @@ -486,6 +494,14 @@ func rewriteValueS390X(v *Value, config *Config) bool { return rewriteValueS390X_OpS390XFMOVSstore(v, config) case OpS390XFMOVSstoreidx: return rewriteValueS390X_OpS390XFMOVSstoreidx(v, config) + case OpS390XFSUB: + return rewriteValueS390X_OpS390XFSUB(v, config) + case OpS390XFSUBS: + return rewriteValueS390X_OpS390XFSUBS(v, config) + case 
OpS390XLoweredRound32F: + return rewriteValueS390X_OpS390XLoweredRound32F(v, config) + case OpS390XLoweredRound64F: + return rewriteValueS390X_OpS390XLoweredRound64F(v, config) case OpS390XMOVBZload: return rewriteValueS390X_OpS390XMOVBZload(v, config) case OpS390XMOVBZloadidx: @@ -4820,6 +4836,32 @@ func rewriteValueS390X_OpOrB(v *Value, config *Config) bool { return true } } +func rewriteValueS390X_OpRound32F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round32F x) + // cond: + // result: (LoweredRound32F x) + for { + x := v.Args[0] + v.reset(OpS390XLoweredRound32F) + v.AddArg(x) + return true + } +} +func rewriteValueS390X_OpRound64F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round64F x) + // cond: + // result: (LoweredRound64F x) + for { + x := v.Args[0] + v.reset(OpS390XLoweredRound64F) + v.AddArg(x) + return true + } +} func rewriteValueS390X_OpRsh16Ux16(v *Value, config *Config) bool { b := v.Block _ = b @@ -7271,6 +7313,84 @@ func rewriteValueS390X_OpS390XCMPconst(v *Value, config *Config) bool { } return false } +func rewriteValueS390X_OpS390XFADD(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (FADD x (FMUL y z)) + // cond: + // result: (FMADD x y z) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpS390XFMUL { + break + } + y := v_1.Args[0] + z := v_1.Args[1] + v.reset(OpS390XFMADD) + v.AddArg(x) + v.AddArg(y) + v.AddArg(z) + return true + } + // match: (FADD (FMUL y z) x) + // cond: + // result: (FMADD x y z) + for { + v_0 := v.Args[0] + if v_0.Op != OpS390XFMUL { + break + } + y := v_0.Args[0] + z := v_0.Args[1] + x := v.Args[1] + v.reset(OpS390XFMADD) + v.AddArg(x) + v.AddArg(y) + v.AddArg(z) + return true + } + return false +} +func rewriteValueS390X_OpS390XFADDS(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (FADDS x (FMULS y z)) + // cond: + // result: (FMADDS x y z) + for { + x := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpS390XFMULS { + break + } + y := v_1.Args[0] + z := v_1.Args[1] + v.reset(OpS390XFMADDS) + v.AddArg(x) + v.AddArg(y) + v.AddArg(z) + return true + } + // match: (FADDS (FMULS y z) x) + // cond: + // result: (FMADDS x y z) + for { + v_0 := v.Args[0] + if v_0.Op != OpS390XFMULS { + break + } + y := v_0.Args[0] + z := v_0.Args[1] + x := v.Args[1] + v.reset(OpS390XFMADDS) + v.AddArg(x) + v.AddArg(y) + v.AddArg(z) + return true + } + return false +} func rewriteValueS390X_OpS390XFMOVDload(v *Value, config *Config) bool { b := v.Block _ = b @@ -7899,6 +8019,86 @@ func rewriteValueS390X_OpS390XFMOVSstoreidx(v *Value, config *Config) bool { } return false } +func rewriteValueS390X_OpS390XFSUB(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (FSUB (FMUL y z) x) + // cond: + // result: (FMSUB x y z) + for { + v_0 := v.Args[0] + if v_0.Op != OpS390XFMUL { + break + } + y := v_0.Args[0] + z := v_0.Args[1] + x := v.Args[1] + v.reset(OpS390XFMSUB) + v.AddArg(x) + v.AddArg(y) + v.AddArg(z) + return true + } + return false +} +func rewriteValueS390X_OpS390XFSUBS(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (FSUBS (FMULS y z) x) + // cond: + // result: (FMSUBS x y z) + for { + v_0 := v.Args[0] + if v_0.Op != OpS390XFMULS { + break + } + y := v_0.Args[0] + z := v_0.Args[1] + x := v.Args[1] + v.reset(OpS390XFMSUBS) + v.AddArg(x) + v.AddArg(y) + v.AddArg(z) + return true + } + return false +} +func rewriteValueS390X_OpS390XLoweredRound32F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (LoweredRound32F 
x:(FMOVSconst)) + // cond: + // result: x + for { + x := v.Args[0] + if x.Op != OpS390XFMOVSconst { + break + } + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } + return false +} +func rewriteValueS390X_OpS390XLoweredRound64F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (LoweredRound64F x:(FMOVDconst)) + // cond: + // result: x + for { + x := v.Args[0] + if x.Op != OpS390XFMOVDconst { + break + } + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } + return false +} func rewriteValueS390X_OpS390XMOVBZload(v *Value, config *Config) bool { b := v.Block _ = b diff --git a/src/cmd/compile/internal/ssa/rewritegeneric.go b/src/cmd/compile/internal/ssa/rewritegeneric.go index 57e258b029..3033a31f98 100644 --- a/src/cmd/compile/internal/ssa/rewritegeneric.go +++ b/src/cmd/compile/internal/ssa/rewritegeneric.go @@ -270,6 +270,10 @@ func rewriteValuegeneric(v *Value, config *Config) bool { return rewriteValuegeneric_OpPhi(v, config) case OpPtrIndex: return rewriteValuegeneric_OpPtrIndex(v, config) + case OpRound32F: + return rewriteValuegeneric_OpRound32F(v, config) + case OpRound64F: + return rewriteValuegeneric_OpRound64F(v, config) case OpRsh16Ux16: return rewriteValuegeneric_OpRsh16Ux16(v, config) case OpRsh16Ux32: @@ -9786,6 +9790,42 @@ func rewriteValuegeneric_OpPtrIndex(v *Value, config *Config) bool { } return false } +func rewriteValuegeneric_OpRound32F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round32F x:(Const32F)) + // cond: + // result: x + for { + x := v.Args[0] + if x.Op != OpConst32F { + break + } + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } + return false +} +func rewriteValuegeneric_OpRound64F(v *Value, config *Config) bool { + b := v.Block + _ = b + // match: (Round64F x:(Const64F)) + // cond: + // result: x + for { + x := v.Args[0] + if x.Op != OpConst64F { + break + } + v.reset(OpCopy) + v.Type = x.Type + v.AddArg(x) + return true + } + return false +} func rewriteValuegeneric_OpRsh16Ux16(v *Value, config *Config) bool { b := v.Block _ = b -- 2.50.0
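
A quick manual check of the new code generation outside the asm_test.go
harness (a sketch; the file and package names are arbitrary): build the
snippet below with GOOS=linux GOARCH=s390x go build -gcflags=-S and grep
the assembly listing for FMADD/FMSUB.

	// fma.go
	package p

	func MulAdd(x, y, z float64) float64 {
		return x*y + z // should assemble to a single FMADD
	}

	func MulAddRounded(x, y, z float64) float64 {
		return float64(x*y) + z // should keep separate FMUL and FADD
	}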