From: Ben Shi Date: Sat, 17 Feb 2018 12:57:44 +0000 (+0000) Subject: cmd/compile: improve FP performance on ARM64 X-Git-Tag: go1.11beta1~1492 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=f4c3072cf53889b2f11c97ee0bca53925b791f5f;p=gostls13.git cmd/compile: improve FP performance on ARM64 FMADD/FMSUB/FNMADD/FNMSUB are efficient FP instructions, which can be used by the comiler to improve FP performance. This CL implements this optimization. 1. The compilecmp benchmark shows little change. name old time/op new time/op delta Template 2.35s ± 4% 2.38s ± 4% ~ (p=0.161 n=15+15) Unicode 1.36s ± 5% 1.36s ± 4% ~ (p=0.685 n=14+13) GoTypes 8.11s ± 3% 8.13s ± 2% ~ (p=0.624 n=15+15) Compiler 40.5s ± 2% 40.7s ± 2% ~ (p=0.137 n=15+15) SSA 115s ± 3% 116s ± 1% ~ (p=0.270 n=15+14) Flate 1.46s ± 4% 1.45s ± 5% ~ (p=0.870 n=15+15) GoParser 1.85s ± 2% 1.87s ± 3% ~ (p=0.477 n=14+15) Reflect 5.11s ± 4% 5.10s ± 2% ~ (p=0.624 n=15+15) Tar 2.23s ± 3% 2.23s ± 5% ~ (p=0.624 n=15+15) XML 2.72s ± 5% 2.74s ± 3% ~ (p=0.290 n=15+14) [Geo mean] 5.02s 5.03s +0.29% name old user-time/op new user-time/op delta Template 2.90s ± 2% 2.90s ± 3% ~ (p=0.780 n=14+15) Unicode 1.71s ± 5% 1.70s ± 3% ~ (p=0.458 n=14+13) GoTypes 9.77s ± 2% 9.76s ± 2% ~ (p=0.838 n=15+15) Compiler 49.1s ± 2% 49.1s ± 2% ~ (p=0.902 n=15+15) SSA 144s ± 1% 144s ± 2% ~ (p=0.567 n=15+15) Flate 1.75s ± 5% 1.74s ± 3% ~ (p=0.461 n=15+15) GoParser 2.22s ± 2% 2.21s ± 3% ~ (p=0.233 n=15+15) Reflect 5.99s ± 2% 5.95s ± 1% ~ (p=0.093 n=14+15) Tar 2.68s ± 2% 2.67s ± 3% ~ (p=0.310 n=14+15) XML 3.22s ± 2% 3.24s ± 3% ~ (p=0.512 n=15+15) [Geo mean] 6.08s 6.07s -0.19% name old text-bytes new text-bytes delta HelloSize 641kB ± 0% 641kB ± 0% ~ (all equal) name old data-bytes new data-bytes delta HelloSize 9.46kB ± 0% 9.46kB ± 0% ~ (all equal) name old bss-bytes new bss-bytes delta HelloSize 125kB ± 0% 125kB ± 0% ~ (all equal) name old exe-bytes new exe-bytes delta HelloSize 1.24MB ± 0% 1.24MB ± 0% ~ (all equal) 2. The go1 benchmark shows little improvement in total (excluding noise), but some improvement in test case Mandelbrot200 and FmtFprintfFloat. name old time/op new time/op delta BinaryTree17-4 42.1s ± 2% 42.0s ± 2% ~ (p=0.453 n=30+28) Fannkuch11-4 33.5s ± 3% 33.3s ± 3% -0.38% (p=0.045 n=30+30) FmtFprintfEmpty-4 534ns ± 0% 534ns ± 0% ~ (all equal) FmtFprintfString-4 1.09µs ± 0% 1.09µs ± 0% -0.27% (p=0.000 n=23+17) FmtFprintfInt-4 1.16µs ± 3% 1.16µs ± 3% ~ (p=0.714 n=30+30) FmtFprintfIntInt-4 1.76µs ± 1% 1.77µs ± 0% +0.15% (p=0.002 n=23+23) FmtFprintfPrefixedInt-4 2.21µs ± 3% 2.20µs ± 3% ~ (p=0.390 n=30+30) FmtFprintfFloat-4 3.28µs ± 0% 3.11µs ± 0% -5.01% (p=0.000 n=25+26) FmtManyArgs-4 7.18µs ± 0% 7.19µs ± 0% +0.13% (p=0.000 n=24+25) GobDecode-4 94.9ms ± 0% 95.6ms ± 5% +0.83% (p=0.002 n=23+29) GobEncode-4 80.7ms ± 4% 79.8ms ± 0% -1.11% (p=0.003 n=30+24) Gzip-4 4.58s ± 4% 4.59s ± 3% +0.26% (p=0.002 n=30+26) Gunzip-4 449ms ± 4% 443ms ± 0% ~ (p=0.096 n=30+26) HTTPClientServer-4 553µs ± 1% 548µs ± 1% -0.96% (p=0.000 n=30+30) JSONEncode-4 215ms ± 4% 214ms ± 4% -0.29% (p=0.000 n=30+30) JSONDecode-4 868ms ± 4% 875ms ± 5% +0.79% (p=0.008 n=30+30) Mandelbrot200-4 51.4ms ± 0% 46.7ms ± 3% -9.09% (p=0.000 n=25+26) GoParse-4 42.1ms ± 0% 41.8ms ± 0% -0.61% (p=0.000 n=25+24) RegexpMatchEasy0_32-4 1.02µs ± 4% 1.02µs ± 4% -0.17% (p=0.000 n=30+30) RegexpMatchEasy0_1K-4 3.90µs ± 0% 3.95µs ± 4% ~ (p=0.516 n=23+30) RegexpMatchEasy1_32-4 970ns ± 3% 973ns ± 3% ~ (p=0.951 n=30+30) RegexpMatchEasy1_1K-4 6.43µs ± 3% 6.33µs ± 0% -1.62% (p=0.000 n=30+25) RegexpMatchMedium_32-4 1.75µs ± 0% 1.75µs ± 0% ~ (p=0.422 n=25+24) RegexpMatchMedium_1K-4 568µs ± 3% 562µs ± 0% ~ (p=0.079 n=30+24) RegexpMatchHard_32-4 30.8µs ± 0% 31.2µs ± 4% +1.46% (p=0.018 n=23+30) RegexpMatchHard_1K-4 932µs ± 0% 946µs ± 3% +1.49% (p=0.000 n=24+30) Revcomp-4 7.69s ± 3% 7.69s ± 2% +0.04% (p=0.032 n=24+25) Template-4 893ms ± 5% 880ms ± 6% -1.53% (p=0.000 n=30+30) TimeParse-4 4.90µs ± 3% 4.84µs ± 0% ~ (p=0.080 n=30+25) TimeFormat-4 4.70µs ± 1% 4.76µs ± 0% +1.21% (p=0.000 n=23+26) [Geo mean] 710µs 706µs -0.63% name old speed new speed delta GobDecode-4 8.09MB/s ± 0% 8.03MB/s ± 5% -0.77% (p=0.002 n=23+29) GobEncode-4 9.52MB/s ± 4% 9.62MB/s ± 0% +1.07% (p=0.003 n=30+24) Gzip-4 4.24MB/s ± 4% 4.23MB/s ± 3% -0.35% (p=0.002 n=30+26) Gunzip-4 43.2MB/s ± 4% 43.8MB/s ± 0% ~ (p=0.123 n=30+26) JSONEncode-4 9.03MB/s ± 4% 9.06MB/s ± 4% +0.28% (p=0.000 n=30+30) JSONDecode-4 2.24MB/s ± 4% 2.22MB/s ± 5% -0.79% (p=0.008 n=30+30) GoParse-4 1.38MB/s ± 1% 1.38MB/s ± 0% ~ (p=0.401 n=25+17) RegexpMatchEasy0_32-4 31.4MB/s ± 4% 31.5MB/s ± 3% +0.16% (p=0.000 n=30+30) RegexpMatchEasy0_1K-4 262MB/s ± 0% 259MB/s ± 4% ~ (p=0.693 n=23+30) RegexpMatchEasy1_32-4 33.0MB/s ± 3% 32.9MB/s ± 3% ~ (p=0.139 n=30+30) RegexpMatchEasy1_1K-4 159MB/s ± 3% 162MB/s ± 0% +1.60% (p=0.000 n=30+25) RegexpMatchMedium_32-4 570kB/s ± 0% 570kB/s ± 0% ~ (all equal) RegexpMatchMedium_1K-4 1.80MB/s ± 3% 1.82MB/s ± 0% +1.09% (p=0.007 n=30+24) RegexpMatchHard_32-4 1.04MB/s ± 0% 1.03MB/s ± 3% -1.38% (p=0.003 n=23+30) RegexpMatchHard_1K-4 1.10MB/s ± 0% 1.08MB/s ± 3% -1.52% (p=0.000 n=24+30) Revcomp-4 33.0MB/s ± 3% 33.0MB/s ± 2% ~ (p=0.128 n=24+25) Template-4 2.17MB/s ± 5% 2.21MB/s ± 6% +1.61% (p=0.000 n=30+30) [Geo mean] 7.79MB/s 7.79MB/s +0.05% Change-Id: Ied3dbdb5ba8e386168629cba06fcd4263bbb83e1 Reviewed-on: https://go-review.googlesource.com/94901 Run-TryBot: Brad Fitzpatrick Reviewed-by: Brad Fitzpatrick --- diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go index 574fb9ff73..795b1a74c5 100644 --- a/src/cmd/compile/internal/arm64/ssa.go +++ b/src/cmd/compile/internal/arm64/ssa.go @@ -186,6 +186,25 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.Reg = r1 p.To.Type = obj.TYPE_REG p.To.Reg = r + case ssa.OpARM64FMADDS, + ssa.OpARM64FMADDD, + ssa.OpARM64FNMADDS, + ssa.OpARM64FNMADDD, + ssa.OpARM64FMSUBS, + ssa.OpARM64FMSUBD, + ssa.OpARM64FNMSUBS, + ssa.OpARM64FNMSUBD: + rt := v.Reg() + ra := v.Args[0].Reg() + rm := v.Args[1].Reg() + rn := v.Args[2].Reg() + p := s.Prog(v.Op.Asm()) + p.Reg = ra + p.From.Type = obj.TYPE_REG + p.From.Reg = rm + p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: rn}) + p.To.Type = obj.TYPE_REG + p.To.Reg = rt case ssa.OpARM64ADDconst, ssa.OpARM64SUBconst, ssa.OpARM64ANDconst, diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules index 5132e42b6d..3822a378be 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM64.rules +++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules @@ -1448,3 +1448,15 @@ (FNEGD (FNMULD x y)) -> (FMULD x y) (FNMULS (FNEGS x) y) -> (FMULS x y) (FNMULD (FNEGD x) y) -> (FMULD x y) +(FADDS a (FMULS x y)) -> (FMADDS a x y) +(FADDD a (FMULD x y)) -> (FMADDD a x y) +(FSUBS a (FMULS x y)) -> (FMSUBS a x y) +(FSUBD a (FMULD x y)) -> (FMSUBD a x y) +(FSUBS (FMULS x y) a) -> (FNMSUBS a x y) +(FSUBD (FMULD x y) a) -> (FNMSUBD a x y) +(FADDS a (FNMULS x y)) -> (FMSUBS a x y) +(FADDD a (FNMULD x y)) -> (FMSUBD a x y) +(FSUBS a (FNMULS x y)) -> (FMADDS a x y) +(FSUBD a (FNMULD x y)) -> (FMADDD a x y) +(FSUBS (FNMULS x y) a) -> (FNMADDS a x y) +(FSUBD (FNMULD x y) a) -> (FNMADDD a x y) diff --git a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go index 6acc9c89f2..1d70c4e864 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go +++ b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go @@ -152,6 +152,7 @@ func init() { fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}} gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}} fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}} + fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}} fp2flags = regInfo{inputs: []regMask{fp, fp}} fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}} fpstore = regInfo{inputs: []regMask{gpspsbg, fp}} @@ -216,6 +217,16 @@ func init() { {name: "VCNT", argLength: 1, reg: fp11, asm: "VCNT"}, // count set bits for each 8-bit unit and store the result in each 8-bit unit {name: "VUADDLV", argLength: 1, reg: fp11, asm: "VUADDLV"}, // unsigned sum of eight bytes in a 64-bit value, zero extended to 64-bit. + // 3-operand, the addend comes first + {name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS"}, // +arg0 + (arg1 * arg2) + {name: "FMADDD", argLength: 3, reg: fp31, asm: "FMADDD"}, // +arg0 + (arg1 * arg2) + {name: "FNMADDS", argLength: 3, reg: fp31, asm: "FNMADDS"}, // -arg0 - (arg1 * arg2) + {name: "FNMADDD", argLength: 3, reg: fp31, asm: "FNMADDD"}, // -arg0 - (arg1 * arg2) + {name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS"}, // +arg0 - (arg1 * arg2) + {name: "FMSUBD", argLength: 3, reg: fp31, asm: "FMSUBD"}, // +arg0 - (arg1 * arg2) + {name: "FNMSUBS", argLength: 3, reg: fp31, asm: "FNMSUBS"}, // -arg0 + (arg1 * arg2) + {name: "FNMSUBD", argLength: 3, reg: fp31, asm: "FNMSUBD"}, // -arg0 + (arg1 * arg2) + // shifts {name: "SLL", argLength: 2, reg: gp21, asm: "LSL"}, // arg0 << arg1, shift amount is mod 64 {name: "SLLconst", argLength: 1, reg: gp11, asm: "LSL", aux: "Int64"}, // arg0 << auxInt diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index ceb57a86ba..5131e8d834 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1003,6 +1003,14 @@ const ( OpARM64CLZW OpARM64VCNT OpARM64VUADDLV + OpARM64FMADDS + OpARM64FMADDD + OpARM64FNMADDS + OpARM64FNMADDD + OpARM64FMSUBS + OpARM64FMSUBD + OpARM64FNMSUBS + OpARM64FNMSUBD OpARM64SLL OpARM64SLLconst OpARM64SRL @@ -12757,6 +12765,126 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "FMADDS", + argLen: 3, + asm: arm64.AFMADDS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "FMADDD", + argLen: 3, + asm: arm64.AFMADDD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "FNMADDS", + argLen: 3, + asm: arm64.AFNMADDS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "FNMADDD", + argLen: 3, + asm: arm64.AFNMADDD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "FMSUBS", + argLen: 3, + asm: arm64.AFMSUBS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "FMSUBD", + argLen: 3, + asm: arm64.AFMSUBD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "FNMSUBS", + argLen: 3, + asm: arm64.AFNMSUBS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "FNMSUBD", + argLen: 3, + asm: arm64.AFNMSUBD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, { name: "SLL", argLen: 2, diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go index b5b74c258c..f711aade36 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM64.go +++ b/src/cmd/compile/internal/ssa/rewriteARM64.go @@ -69,6 +69,10 @@ func rewriteValueARM64(v *Value) bool { return rewriteValueARM64_OpARM64DIVW_0(v) case OpARM64Equal: return rewriteValueARM64_OpARM64Equal_0(v) + case OpARM64FADDD: + return rewriteValueARM64_OpARM64FADDD_0(v) + case OpARM64FADDS: + return rewriteValueARM64_OpARM64FADDS_0(v) case OpARM64FMOVDgpfp: return rewriteValueARM64_OpARM64FMOVDgpfp_0(v) case OpARM64FMOVDload: @@ -91,6 +95,10 @@ func rewriteValueARM64(v *Value) bool { return rewriteValueARM64_OpARM64FNMULD_0(v) case OpARM64FNMULS: return rewriteValueARM64_OpARM64FNMULS_0(v) + case OpARM64FSUBD: + return rewriteValueARM64_OpARM64FSUBD_0(v) + case OpARM64FSUBS: + return rewriteValueARM64_OpARM64FSUBS_0(v) case OpARM64GreaterEqual: return rewriteValueARM64_OpARM64GreaterEqual_0(v) case OpARM64GreaterEqualU: @@ -2972,6 +2980,164 @@ func rewriteValueARM64_OpARM64Equal_0(v *Value) bool { } return false } +func rewriteValueARM64_OpARM64FADDD_0(v *Value) bool { + // match: (FADDD a (FMULD x y)) + // cond: + // result: (FMADDD a x y) + for { + _ = v.Args[1] + a := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARM64FMULD { + break + } + _ = v_1.Args[1] + x := v_1.Args[0] + y := v_1.Args[1] + v.reset(OpARM64FMADDD) + v.AddArg(a) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FADDD (FMULD x y) a) + // cond: + // result: (FMADDD a x y) + for { + _ = v.Args[1] + v_0 := v.Args[0] + if v_0.Op != OpARM64FMULD { + break + } + _ = v_0.Args[1] + x := v_0.Args[0] + y := v_0.Args[1] + a := v.Args[1] + v.reset(OpARM64FMADDD) + v.AddArg(a) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FADDD a (FNMULD x y)) + // cond: + // result: (FMSUBD a x y) + for { + _ = v.Args[1] + a := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARM64FNMULD { + break + } + _ = v_1.Args[1] + x := v_1.Args[0] + y := v_1.Args[1] + v.reset(OpARM64FMSUBD) + v.AddArg(a) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FADDD (FNMULD x y) a) + // cond: + // result: (FMSUBD a x y) + for { + _ = v.Args[1] + v_0 := v.Args[0] + if v_0.Op != OpARM64FNMULD { + break + } + _ = v_0.Args[1] + x := v_0.Args[0] + y := v_0.Args[1] + a := v.Args[1] + v.reset(OpARM64FMSUBD) + v.AddArg(a) + v.AddArg(x) + v.AddArg(y) + return true + } + return false +} +func rewriteValueARM64_OpARM64FADDS_0(v *Value) bool { + // match: (FADDS a (FMULS x y)) + // cond: + // result: (FMADDS a x y) + for { + _ = v.Args[1] + a := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARM64FMULS { + break + } + _ = v_1.Args[1] + x := v_1.Args[0] + y := v_1.Args[1] + v.reset(OpARM64FMADDS) + v.AddArg(a) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FADDS (FMULS x y) a) + // cond: + // result: (FMADDS a x y) + for { + _ = v.Args[1] + v_0 := v.Args[0] + if v_0.Op != OpARM64FMULS { + break + } + _ = v_0.Args[1] + x := v_0.Args[0] + y := v_0.Args[1] + a := v.Args[1] + v.reset(OpARM64FMADDS) + v.AddArg(a) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FADDS a (FNMULS x y)) + // cond: + // result: (FMSUBS a x y) + for { + _ = v.Args[1] + a := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARM64FNMULS { + break + } + _ = v_1.Args[1] + x := v_1.Args[0] + y := v_1.Args[1] + v.reset(OpARM64FMSUBS) + v.AddArg(a) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FADDS (FNMULS x y) a) + // cond: + // result: (FMSUBS a x y) + for { + _ = v.Args[1] + v_0 := v.Args[0] + if v_0.Op != OpARM64FNMULS { + break + } + _ = v_0.Args[1] + x := v_0.Args[0] + y := v_0.Args[1] + a := v.Args[1] + v.reset(OpARM64FMSUBS) + v.AddArg(a) + v.AddArg(x) + v.AddArg(y) + return true + } + return false +} func rewriteValueARM64_OpARM64FMOVDgpfp_0(v *Value) bool { b := v.Block _ = b @@ -3456,6 +3622,164 @@ func rewriteValueARM64_OpARM64FNMULS_0(v *Value) bool { } return false } +func rewriteValueARM64_OpARM64FSUBD_0(v *Value) bool { + // match: (FSUBD a (FMULD x y)) + // cond: + // result: (FMSUBD a x y) + for { + _ = v.Args[1] + a := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARM64FMULD { + break + } + _ = v_1.Args[1] + x := v_1.Args[0] + y := v_1.Args[1] + v.reset(OpARM64FMSUBD) + v.AddArg(a) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FSUBD (FMULD x y) a) + // cond: + // result: (FNMSUBD a x y) + for { + _ = v.Args[1] + v_0 := v.Args[0] + if v_0.Op != OpARM64FMULD { + break + } + _ = v_0.Args[1] + x := v_0.Args[0] + y := v_0.Args[1] + a := v.Args[1] + v.reset(OpARM64FNMSUBD) + v.AddArg(a) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FSUBD a (FNMULD x y)) + // cond: + // result: (FMADDD a x y) + for { + _ = v.Args[1] + a := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARM64FNMULD { + break + } + _ = v_1.Args[1] + x := v_1.Args[0] + y := v_1.Args[1] + v.reset(OpARM64FMADDD) + v.AddArg(a) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FSUBD (FNMULD x y) a) + // cond: + // result: (FNMADDD a x y) + for { + _ = v.Args[1] + v_0 := v.Args[0] + if v_0.Op != OpARM64FNMULD { + break + } + _ = v_0.Args[1] + x := v_0.Args[0] + y := v_0.Args[1] + a := v.Args[1] + v.reset(OpARM64FNMADDD) + v.AddArg(a) + v.AddArg(x) + v.AddArg(y) + return true + } + return false +} +func rewriteValueARM64_OpARM64FSUBS_0(v *Value) bool { + // match: (FSUBS a (FMULS x y)) + // cond: + // result: (FMSUBS a x y) + for { + _ = v.Args[1] + a := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARM64FMULS { + break + } + _ = v_1.Args[1] + x := v_1.Args[0] + y := v_1.Args[1] + v.reset(OpARM64FMSUBS) + v.AddArg(a) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FSUBS (FMULS x y) a) + // cond: + // result: (FNMSUBS a x y) + for { + _ = v.Args[1] + v_0 := v.Args[0] + if v_0.Op != OpARM64FMULS { + break + } + _ = v_0.Args[1] + x := v_0.Args[0] + y := v_0.Args[1] + a := v.Args[1] + v.reset(OpARM64FNMSUBS) + v.AddArg(a) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FSUBS a (FNMULS x y)) + // cond: + // result: (FMADDS a x y) + for { + _ = v.Args[1] + a := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARM64FNMULS { + break + } + _ = v_1.Args[1] + x := v_1.Args[0] + y := v_1.Args[1] + v.reset(OpARM64FMADDS) + v.AddArg(a) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FSUBS (FNMULS x y) a) + // cond: + // result: (FNMADDS a x y) + for { + _ = v.Args[1] + v_0 := v.Args[0] + if v_0.Op != OpARM64FNMULS { + break + } + _ = v_0.Args[1] + x := v_0.Args[0] + y := v_0.Args[1] + a := v.Args[1] + v.reset(OpARM64FNMADDS) + v.AddArg(a) + v.AddArg(x) + v.AddArg(y) + return true + } + return false +} func rewriteValueARM64_OpARM64GreaterEqual_0(v *Value) bool { // match: (GreaterEqual (FlagEQ)) // cond: