]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: improve FP performance on ARM64
authorBen Shi <powerman1st@163.com>
Sat, 17 Feb 2018 12:57:44 +0000 (12:57 +0000)
committerBrad Fitzpatrick <bradfitz@golang.org>
Thu, 22 Feb 2018 04:10:07 +0000 (04:10 +0000)
FMADD/FMSUB/FNMADD/FNMSUB are efficient FP instructions, which can
be used by the comiler to improve FP performance. This CL implements
this optimization.

1. The compilecmp benchmark shows little change.
name        old time/op       new time/op       delta
Template          2.35s ± 4%        2.38s ± 4%    ~     (p=0.161 n=15+15)
Unicode           1.36s ± 5%        1.36s ± 4%    ~     (p=0.685 n=14+13)
GoTypes           8.11s ± 3%        8.13s ± 2%    ~     (p=0.624 n=15+15)
Compiler          40.5s ± 2%        40.7s ± 2%    ~     (p=0.137 n=15+15)
SSA                115s ± 3%         116s ± 1%    ~     (p=0.270 n=15+14)
Flate             1.46s ± 4%        1.45s ± 5%    ~     (p=0.870 n=15+15)
GoParser          1.85s ± 2%        1.87s ± 3%    ~     (p=0.477 n=14+15)
Reflect           5.11s ± 4%        5.10s ± 2%    ~     (p=0.624 n=15+15)
Tar               2.23s ± 3%        2.23s ± 5%    ~     (p=0.624 n=15+15)
XML               2.72s ± 5%        2.74s ± 3%    ~     (p=0.290 n=15+14)
[Geo mean]        5.02s             5.03s       +0.29%

name        old user-time/op  new user-time/op  delta
Template          2.90s ± 2%        2.90s ± 3%    ~     (p=0.780 n=14+15)
Unicode           1.71s ± 5%        1.70s ± 3%    ~     (p=0.458 n=14+13)
GoTypes           9.77s ± 2%        9.76s ± 2%    ~     (p=0.838 n=15+15)
Compiler          49.1s ± 2%        49.1s ± 2%    ~     (p=0.902 n=15+15)
SSA                144s ± 1%         144s ± 2%    ~     (p=0.567 n=15+15)
Flate             1.75s ± 5%        1.74s ± 3%    ~     (p=0.461 n=15+15)
GoParser          2.22s ± 2%        2.21s ± 3%    ~     (p=0.233 n=15+15)
Reflect           5.99s ± 2%        5.95s ± 1%    ~     (p=0.093 n=14+15)
Tar               2.68s ± 2%        2.67s ± 3%    ~     (p=0.310 n=14+15)
XML               3.22s ± 2%        3.24s ± 3%    ~     (p=0.512 n=15+15)
[Geo mean]        6.08s             6.07s       -0.19%

name        old text-bytes    new text-bytes    delta
HelloSize         641kB ± 0%        641kB ± 0%    ~     (all equal)

name        old data-bytes    new data-bytes    delta
HelloSize        9.46kB ± 0%       9.46kB ± 0%    ~     (all equal)

name        old bss-bytes     new bss-bytes     delta
HelloSize         125kB ± 0%        125kB ± 0%    ~     (all equal)

name        old exe-bytes     new exe-bytes     delta
HelloSize        1.24MB ± 0%       1.24MB ± 0%    ~     (all equal)

2. The go1 benchmark shows little improvement in total (excluding noise),
but some improvement in test case Mandelbrot200 and FmtFprintfFloat.
name                     old time/op    new time/op    delta
BinaryTree17-4              42.1s ± 2%     42.0s ± 2%    ~     (p=0.453 n=30+28)
Fannkuch11-4                33.5s ± 3%     33.3s ± 3%  -0.38%  (p=0.045 n=30+30)
FmtFprintfEmpty-4           534ns ± 0%     534ns ± 0%    ~     (all equal)
FmtFprintfString-4         1.09µs ± 0%    1.09µs ± 0%  -0.27%  (p=0.000 n=23+17)
FmtFprintfInt-4            1.16µs ± 3%    1.16µs ± 3%    ~     (p=0.714 n=30+30)
FmtFprintfIntInt-4         1.76µs ± 1%    1.77µs ± 0%  +0.15%  (p=0.002 n=23+23)
FmtFprintfPrefixedInt-4    2.21µs ± 3%    2.20µs ± 3%    ~     (p=0.390 n=30+30)
FmtFprintfFloat-4          3.28µs ± 0%    3.11µs ± 0%  -5.01%  (p=0.000 n=25+26)
FmtManyArgs-4              7.18µs ± 0%    7.19µs ± 0%  +0.13%  (p=0.000 n=24+25)
GobDecode-4                94.9ms ± 0%    95.6ms ± 5%  +0.83%  (p=0.002 n=23+29)
GobEncode-4                80.7ms ± 4%    79.8ms ± 0%  -1.11%  (p=0.003 n=30+24)
Gzip-4                      4.58s ± 4%     4.59s ± 3%  +0.26%  (p=0.002 n=30+26)
Gunzip-4                    449ms ± 4%     443ms ± 0%    ~     (p=0.096 n=30+26)
HTTPClientServer-4          553µs ± 1%     548µs ± 1%  -0.96%  (p=0.000 n=30+30)
JSONEncode-4                215ms ± 4%     214ms ± 4%  -0.29%  (p=0.000 n=30+30)
JSONDecode-4                868ms ± 4%     875ms ± 5%  +0.79%  (p=0.008 n=30+30)
Mandelbrot200-4            51.4ms ± 0%    46.7ms ± 3%  -9.09%  (p=0.000 n=25+26)
GoParse-4                  42.1ms ± 0%    41.8ms ± 0%  -0.61%  (p=0.000 n=25+24)
RegexpMatchEasy0_32-4      1.02µs ± 4%    1.02µs ± 4%  -0.17%  (p=0.000 n=30+30)
RegexpMatchEasy0_1K-4      3.90µs ± 0%    3.95µs ± 4%    ~     (p=0.516 n=23+30)
RegexpMatchEasy1_32-4       970ns ± 3%     973ns ± 3%    ~     (p=0.951 n=30+30)
RegexpMatchEasy1_1K-4      6.43µs ± 3%    6.33µs ± 0%  -1.62%  (p=0.000 n=30+25)
RegexpMatchMedium_32-4     1.75µs ± 0%    1.75µs ± 0%    ~     (p=0.422 n=25+24)
RegexpMatchMedium_1K-4      568µs ± 3%     562µs ± 0%    ~     (p=0.079 n=30+24)
RegexpMatchHard_32-4       30.8µs ± 0%    31.2µs ± 4%  +1.46%  (p=0.018 n=23+30)
RegexpMatchHard_1K-4        932µs ± 0%     946µs ± 3%  +1.49%  (p=0.000 n=24+30)
Revcomp-4                   7.69s ± 3%     7.69s ± 2%  +0.04%  (p=0.032 n=24+25)
Template-4                  893ms ± 5%     880ms ± 6%  -1.53%  (p=0.000 n=30+30)
TimeParse-4                4.90µs ± 3%    4.84µs ± 0%    ~     (p=0.080 n=30+25)
TimeFormat-4               4.70µs ± 1%    4.76µs ± 0%  +1.21%  (p=0.000 n=23+26)
[Geo mean]                  710µs          706µs       -0.63%

name                     old speed      new speed      delta
GobDecode-4              8.09MB/s ± 0%  8.03MB/s ± 5%  -0.77%  (p=0.002 n=23+29)
GobEncode-4              9.52MB/s ± 4%  9.62MB/s ± 0%  +1.07%  (p=0.003 n=30+24)
Gzip-4                   4.24MB/s ± 4%  4.23MB/s ± 3%  -0.35%  (p=0.002 n=30+26)
Gunzip-4                 43.2MB/s ± 4%  43.8MB/s ± 0%    ~     (p=0.123 n=30+26)
JSONEncode-4             9.03MB/s ± 4%  9.06MB/s ± 4%  +0.28%  (p=0.000 n=30+30)
JSONDecode-4             2.24MB/s ± 4%  2.22MB/s ± 5%  -0.79%  (p=0.008 n=30+30)
GoParse-4                1.38MB/s ± 1%  1.38MB/s ± 0%    ~     (p=0.401 n=25+17)
RegexpMatchEasy0_32-4    31.4MB/s ± 4%  31.5MB/s ± 3%  +0.16%  (p=0.000 n=30+30)
RegexpMatchEasy0_1K-4     262MB/s ± 0%   259MB/s ± 4%    ~     (p=0.693 n=23+30)
RegexpMatchEasy1_32-4    33.0MB/s ± 3%  32.9MB/s ± 3%    ~     (p=0.139 n=30+30)
RegexpMatchEasy1_1K-4     159MB/s ± 3%   162MB/s ± 0%  +1.60%  (p=0.000 n=30+25)
RegexpMatchMedium_32-4    570kB/s ± 0%   570kB/s ± 0%    ~     (all equal)
RegexpMatchMedium_1K-4   1.80MB/s ± 3%  1.82MB/s ± 0%  +1.09%  (p=0.007 n=30+24)
RegexpMatchHard_32-4     1.04MB/s ± 0%  1.03MB/s ± 3%  -1.38%  (p=0.003 n=23+30)
RegexpMatchHard_1K-4     1.10MB/s ± 0%  1.08MB/s ± 3%  -1.52%  (p=0.000 n=24+30)
Revcomp-4                33.0MB/s ± 3%  33.0MB/s ± 2%    ~     (p=0.128 n=24+25)
Template-4               2.17MB/s ± 5%  2.21MB/s ± 6%  +1.61%  (p=0.000 n=30+30)
[Geo mean]               7.79MB/s       7.79MB/s       +0.05%

Change-Id: Ied3dbdb5ba8e386168629cba06fcd4263bbb83e1
Reviewed-on: https://go-review.googlesource.com/94901
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
src/cmd/compile/internal/arm64/ssa.go
src/cmd/compile/internal/ssa/gen/ARM64.rules
src/cmd/compile/internal/ssa/gen/ARM64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteARM64.go

index 574fb9ff735ead18b97dcbed201dd7622c2df0cc..795b1a74c5fb42949c7c36f34c5da138baaeff00 100644 (file)
@@ -186,6 +186,25 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                p.Reg = r1
                p.To.Type = obj.TYPE_REG
                p.To.Reg = r
+       case ssa.OpARM64FMADDS,
+               ssa.OpARM64FMADDD,
+               ssa.OpARM64FNMADDS,
+               ssa.OpARM64FNMADDD,
+               ssa.OpARM64FMSUBS,
+               ssa.OpARM64FMSUBD,
+               ssa.OpARM64FNMSUBS,
+               ssa.OpARM64FNMSUBD:
+               rt := v.Reg()
+               ra := v.Args[0].Reg()
+               rm := v.Args[1].Reg()
+               rn := v.Args[2].Reg()
+               p := s.Prog(v.Op.Asm())
+               p.Reg = ra
+               p.From.Type = obj.TYPE_REG
+               p.From.Reg = rm
+               p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: rn})
+               p.To.Type = obj.TYPE_REG
+               p.To.Reg = rt
        case ssa.OpARM64ADDconst,
                ssa.OpARM64SUBconst,
                ssa.OpARM64ANDconst,
index 5132e42b6df553c691c1f91d6fbb379611ed9a01..3822a378be6f7b1c15aaed4cefb0191806d59687 100644 (file)
 (FNEGD (FNMULD x y)) -> (FMULD x y)
 (FNMULS (FNEGS x) y) -> (FMULS x y)
 (FNMULD (FNEGD x) y) -> (FMULD x y)
+(FADDS a (FMULS x y)) -> (FMADDS a x y)
+(FADDD a (FMULD x y)) -> (FMADDD a x y)
+(FSUBS a (FMULS x y)) -> (FMSUBS a x y)
+(FSUBD a (FMULD x y)) -> (FMSUBD a x y)
+(FSUBS (FMULS x y) a) -> (FNMSUBS a x y)
+(FSUBD (FMULD x y) a) -> (FNMSUBD a x y)
+(FADDS a (FNMULS x y)) -> (FMSUBS a x y)
+(FADDD a (FNMULD x y)) -> (FMSUBD a x y)
+(FSUBS a (FNMULS x y)) -> (FMADDS a x y)
+(FSUBD a (FNMULD x y)) -> (FMADDD a x y)
+(FSUBS (FNMULS x y) a) -> (FNMADDS a x y)
+(FSUBD (FNMULD x y) a) -> (FNMADDD a x y)
index 6acc9c89f2be5aba2880c7e099cb5a02897111f1..1d70c4e8648a28c13fc6a01753a7aafb4e055b4b 100644 (file)
@@ -152,6 +152,7 @@ func init() {
                fpgp      = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}}
                gpfp      = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}}
                fp21      = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+               fp31      = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}}
                fp2flags  = regInfo{inputs: []regMask{fp, fp}}
                fpload    = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}}
                fpstore   = regInfo{inputs: []regMask{gpspsbg, fp}}
@@ -216,6 +217,16 @@ func init() {
                {name: "VCNT", argLength: 1, reg: fp11, asm: "VCNT"},       // count set bits for each 8-bit unit and store the result in each 8-bit unit
                {name: "VUADDLV", argLength: 1, reg: fp11, asm: "VUADDLV"}, // unsigned sum of eight bytes in a 64-bit value, zero extended to 64-bit.
 
+               // 3-operand, the addend comes first
+               {name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS"},   // +arg0 + (arg1 * arg2)
+               {name: "FMADDD", argLength: 3, reg: fp31, asm: "FMADDD"},   // +arg0 + (arg1 * arg2)
+               {name: "FNMADDS", argLength: 3, reg: fp31, asm: "FNMADDS"}, // -arg0 - (arg1 * arg2)
+               {name: "FNMADDD", argLength: 3, reg: fp31, asm: "FNMADDD"}, // -arg0 - (arg1 * arg2)
+               {name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS"},   // +arg0 - (arg1 * arg2)
+               {name: "FMSUBD", argLength: 3, reg: fp31, asm: "FMSUBD"},   // +arg0 - (arg1 * arg2)
+               {name: "FNMSUBS", argLength: 3, reg: fp31, asm: "FNMSUBS"}, // -arg0 + (arg1 * arg2)
+               {name: "FNMSUBD", argLength: 3, reg: fp31, asm: "FNMSUBD"}, // -arg0 + (arg1 * arg2)
+
                // shifts
                {name: "SLL", argLength: 2, reg: gp21, asm: "LSL"},                      // arg0 << arg1, shift amount is mod 64
                {name: "SLLconst", argLength: 1, reg: gp11, asm: "LSL", aux: "Int64"},   // arg0 << auxInt
index ceb57a86ba8eb0c625ab5da2ace623be202c6a3c..5131e8d834d00b804862719b9b4e0b0ab2019396 100644 (file)
@@ -1003,6 +1003,14 @@ const (
        OpARM64CLZW
        OpARM64VCNT
        OpARM64VUADDLV
+       OpARM64FMADDS
+       OpARM64FMADDD
+       OpARM64FNMADDS
+       OpARM64FNMADDD
+       OpARM64FMSUBS
+       OpARM64FMSUBD
+       OpARM64FNMSUBS
+       OpARM64FNMSUBD
        OpARM64SLL
        OpARM64SLLconst
        OpARM64SRL
@@ -12757,6 +12765,126 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:   "FMADDS",
+               argLen: 3,
+               asm:    arm64.AFMADDS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
+       {
+               name:   "FMADDD",
+               argLen: 3,
+               asm:    arm64.AFMADDD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
+       {
+               name:   "FNMADDS",
+               argLen: 3,
+               asm:    arm64.AFNMADDS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
+       {
+               name:   "FNMADDD",
+               argLen: 3,
+               asm:    arm64.AFNMADDD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
+       {
+               name:   "FMSUBS",
+               argLen: 3,
+               asm:    arm64.AFMSUBS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
+       {
+               name:   "FMSUBD",
+               argLen: 3,
+               asm:    arm64.AFMSUBD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
+       {
+               name:   "FNMSUBS",
+               argLen: 3,
+               asm:    arm64.AFNMSUBS,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
+       {
+               name:   "FNMSUBD",
+               argLen: 3,
+               asm:    arm64.AFNMSUBD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {2, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
        {
                name:   "SLL",
                argLen: 2,
index b5b74c258ce4f942b0ac99925f9b246364a9f0a7..f711aade36618e9baf14760f09d6eda07b941f17 100644 (file)
@@ -69,6 +69,10 @@ func rewriteValueARM64(v *Value) bool {
                return rewriteValueARM64_OpARM64DIVW_0(v)
        case OpARM64Equal:
                return rewriteValueARM64_OpARM64Equal_0(v)
+       case OpARM64FADDD:
+               return rewriteValueARM64_OpARM64FADDD_0(v)
+       case OpARM64FADDS:
+               return rewriteValueARM64_OpARM64FADDS_0(v)
        case OpARM64FMOVDgpfp:
                return rewriteValueARM64_OpARM64FMOVDgpfp_0(v)
        case OpARM64FMOVDload:
@@ -91,6 +95,10 @@ func rewriteValueARM64(v *Value) bool {
                return rewriteValueARM64_OpARM64FNMULD_0(v)
        case OpARM64FNMULS:
                return rewriteValueARM64_OpARM64FNMULS_0(v)
+       case OpARM64FSUBD:
+               return rewriteValueARM64_OpARM64FSUBD_0(v)
+       case OpARM64FSUBS:
+               return rewriteValueARM64_OpARM64FSUBS_0(v)
        case OpARM64GreaterEqual:
                return rewriteValueARM64_OpARM64GreaterEqual_0(v)
        case OpARM64GreaterEqualU:
@@ -2972,6 +2980,164 @@ func rewriteValueARM64_OpARM64Equal_0(v *Value) bool {
        }
        return false
 }
+func rewriteValueARM64_OpARM64FADDD_0(v *Value) bool {
+       // match: (FADDD a (FMULD x y))
+       // cond:
+       // result: (FMADDD a x y)
+       for {
+               _ = v.Args[1]
+               a := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARM64FMULD {
+                       break
+               }
+               _ = v_1.Args[1]
+               x := v_1.Args[0]
+               y := v_1.Args[1]
+               v.reset(OpARM64FMADDD)
+               v.AddArg(a)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+       // match: (FADDD (FMULD x y) a)
+       // cond:
+       // result: (FMADDD a x y)
+       for {
+               _ = v.Args[1]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARM64FMULD {
+                       break
+               }
+               _ = v_0.Args[1]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               a := v.Args[1]
+               v.reset(OpARM64FMADDD)
+               v.AddArg(a)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+       // match: (FADDD a (FNMULD x y))
+       // cond:
+       // result: (FMSUBD a x y)
+       for {
+               _ = v.Args[1]
+               a := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARM64FNMULD {
+                       break
+               }
+               _ = v_1.Args[1]
+               x := v_1.Args[0]
+               y := v_1.Args[1]
+               v.reset(OpARM64FMSUBD)
+               v.AddArg(a)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+       // match: (FADDD (FNMULD x y) a)
+       // cond:
+       // result: (FMSUBD a x y)
+       for {
+               _ = v.Args[1]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARM64FNMULD {
+                       break
+               }
+               _ = v_0.Args[1]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               a := v.Args[1]
+               v.reset(OpARM64FMSUBD)
+               v.AddArg(a)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+       return false
+}
+func rewriteValueARM64_OpARM64FADDS_0(v *Value) bool {
+       // match: (FADDS a (FMULS x y))
+       // cond:
+       // result: (FMADDS a x y)
+       for {
+               _ = v.Args[1]
+               a := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARM64FMULS {
+                       break
+               }
+               _ = v_1.Args[1]
+               x := v_1.Args[0]
+               y := v_1.Args[1]
+               v.reset(OpARM64FMADDS)
+               v.AddArg(a)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+       // match: (FADDS (FMULS x y) a)
+       // cond:
+       // result: (FMADDS a x y)
+       for {
+               _ = v.Args[1]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARM64FMULS {
+                       break
+               }
+               _ = v_0.Args[1]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               a := v.Args[1]
+               v.reset(OpARM64FMADDS)
+               v.AddArg(a)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+       // match: (FADDS a (FNMULS x y))
+       // cond:
+       // result: (FMSUBS a x y)
+       for {
+               _ = v.Args[1]
+               a := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARM64FNMULS {
+                       break
+               }
+               _ = v_1.Args[1]
+               x := v_1.Args[0]
+               y := v_1.Args[1]
+               v.reset(OpARM64FMSUBS)
+               v.AddArg(a)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+       // match: (FADDS (FNMULS x y) a)
+       // cond:
+       // result: (FMSUBS a x y)
+       for {
+               _ = v.Args[1]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARM64FNMULS {
+                       break
+               }
+               _ = v_0.Args[1]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               a := v.Args[1]
+               v.reset(OpARM64FMSUBS)
+               v.AddArg(a)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+       return false
+}
 func rewriteValueARM64_OpARM64FMOVDgpfp_0(v *Value) bool {
        b := v.Block
        _ = b
@@ -3456,6 +3622,164 @@ func rewriteValueARM64_OpARM64FNMULS_0(v *Value) bool {
        }
        return false
 }
+func rewriteValueARM64_OpARM64FSUBD_0(v *Value) bool {
+       // match: (FSUBD a (FMULD x y))
+       // cond:
+       // result: (FMSUBD a x y)
+       for {
+               _ = v.Args[1]
+               a := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARM64FMULD {
+                       break
+               }
+               _ = v_1.Args[1]
+               x := v_1.Args[0]
+               y := v_1.Args[1]
+               v.reset(OpARM64FMSUBD)
+               v.AddArg(a)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+       // match: (FSUBD (FMULD x y) a)
+       // cond:
+       // result: (FNMSUBD a x y)
+       for {
+               _ = v.Args[1]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARM64FMULD {
+                       break
+               }
+               _ = v_0.Args[1]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               a := v.Args[1]
+               v.reset(OpARM64FNMSUBD)
+               v.AddArg(a)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+       // match: (FSUBD a (FNMULD x y))
+       // cond:
+       // result: (FMADDD a x y)
+       for {
+               _ = v.Args[1]
+               a := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARM64FNMULD {
+                       break
+               }
+               _ = v_1.Args[1]
+               x := v_1.Args[0]
+               y := v_1.Args[1]
+               v.reset(OpARM64FMADDD)
+               v.AddArg(a)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+       // match: (FSUBD (FNMULD x y) a)
+       // cond:
+       // result: (FNMADDD a x y)
+       for {
+               _ = v.Args[1]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARM64FNMULD {
+                       break
+               }
+               _ = v_0.Args[1]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               a := v.Args[1]
+               v.reset(OpARM64FNMADDD)
+               v.AddArg(a)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+       return false
+}
+func rewriteValueARM64_OpARM64FSUBS_0(v *Value) bool {
+       // match: (FSUBS a (FMULS x y))
+       // cond:
+       // result: (FMSUBS a x y)
+       for {
+               _ = v.Args[1]
+               a := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARM64FMULS {
+                       break
+               }
+               _ = v_1.Args[1]
+               x := v_1.Args[0]
+               y := v_1.Args[1]
+               v.reset(OpARM64FMSUBS)
+               v.AddArg(a)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+       // match: (FSUBS (FMULS x y) a)
+       // cond:
+       // result: (FNMSUBS a x y)
+       for {
+               _ = v.Args[1]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARM64FMULS {
+                       break
+               }
+               _ = v_0.Args[1]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               a := v.Args[1]
+               v.reset(OpARM64FNMSUBS)
+               v.AddArg(a)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+       // match: (FSUBS a (FNMULS x y))
+       // cond:
+       // result: (FMADDS a x y)
+       for {
+               _ = v.Args[1]
+               a := v.Args[0]
+               v_1 := v.Args[1]
+               if v_1.Op != OpARM64FNMULS {
+                       break
+               }
+               _ = v_1.Args[1]
+               x := v_1.Args[0]
+               y := v_1.Args[1]
+               v.reset(OpARM64FMADDS)
+               v.AddArg(a)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+       // match: (FSUBS (FNMULS x y) a)
+       // cond:
+       // result: (FNMADDS a x y)
+       for {
+               _ = v.Args[1]
+               v_0 := v.Args[0]
+               if v_0.Op != OpARM64FNMULS {
+                       break
+               }
+               _ = v_0.Args[1]
+               x := v_0.Args[0]
+               y := v_0.Args[1]
+               a := v.Args[1]
+               v.reset(OpARM64FNMADDS)
+               v.AddArg(a)
+               v.AddArg(x)
+               v.AddArg(y)
+               return true
+       }
+       return false
+}
 func rewriteValueARM64_OpARM64GreaterEqual_0(v *Value) bool {
        // match: (GreaterEqual (FlagEQ))
        // cond: