Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: implement FMA codegen for loong64
author Xiaolin Zhao <zhaoxiaolin@loongson.cn>
Tue, 5 Nov 2024 07:30:45 +0000 (15:30 +0800)
committer abner chenc <chenguoqi@loongson.cn>
Fri, 8 Nov 2024 01:05:48 +0000 (01:05 +0000)
Benchmark results on Loongson 3A5000 and 3A6000:

goos: linux
goarch: loong64
pkg: math
cpu: Loongson-3A6000 @ 2500.00MHz
    |  bench.old   |              bench.new              |
    |    sec/op    |   sec/op     vs base                |
FMA   25.930n ± 0%   2.002n ± 0%  -92.28% (p=0.000 n=10)

goos: linux
goarch: loong64
pkg: math
cpu: Loongson-3A5000 @ 2500.00MHz
    |  bench.old   |              bench.new              |
    |    sec/op    |   sec/op     vs base                |
FMA   32.840n ± 0%   2.002n ± 0%  -93.90% (p=0.000 n=10)

Updates #59120

This patch is a copy of CL 483355.
Co-authored-by: WANG Xuerui <git@xen0n.name>
Change-Id: I88b89d23f00864f9173a182a47ee135afec7ed6e
Reviewed-on: https://go-review.googlesource.com/c/go/+/625335
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
src/cmd/compile/internal/loong64/ssa.go
src/cmd/compile/internal/ssa/_gen/LOONG64.rules
src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteLOONG64.go
src/cmd/compile/internal/ssagen/intrinsics.go
src/cmd/compile/internal/ssagen/intrinsics_test.go
test/codegen/floats.go
test/codegen/math.go

index 02286b8de82f9436659c37febbc6f35eb52dbe97..c49fee680842e1666b130e2b242649f68aab6ffa 100644 (file)
@@ -123,7 +123,9 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                p.From.Reg = x
                p.To.Type = obj.TYPE_REG
                p.To.Reg = y
-       case ssa.OpLOONG64MOVVnop:
+       case ssa.OpLOONG64MOVVnop,
+               ssa.OpLOONG64LoweredRound32F,
+               ssa.OpLOONG64LoweredRound64F:
                // nothing to do
        case ssa.OpLoadReg:
                if v.Type.IsFlags() {
@@ -320,6 +322,30 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                p.Reg = v.Args[1].Reg()
                p.To.Type = obj.TYPE_REG
                p.To.Reg = loong64.REG_FCC0
+
+       case ssa.OpLOONG64FMADDF,
+               ssa.OpLOONG64FMADDD,
+               ssa.OpLOONG64FMSUBF,
+               ssa.OpLOONG64FMSUBD,
+               ssa.OpLOONG64FNMADDF,
+               ssa.OpLOONG64FNMADDD,
+               ssa.OpLOONG64FNMSUBF,
+               ssa.OpLOONG64FNMSUBD:
+               p := s.Prog(v.Op.Asm())
+               // r=(FMA x y z) -> FMADDD z, y, x, r
+               // the SSA operand order is for taking advantage of
+               // commutativity (that only applies for the first two operands)
+               r := v.Reg()
+               x := v.Args[0].Reg()
+               y := v.Args[1].Reg()
+               z := v.Args[2].Reg()
+               p.From.Type = obj.TYPE_REG
+               p.From.Reg = z
+               p.Reg = y
+               p.AddRestSourceReg(x)
+               p.To.Type = obj.TYPE_REG
+               p.To.Reg = r
+
        case ssa.OpLOONG64MOVVaddr:
                p := s.Prog(loong64.AMOVV)
                p.From.Type = obj.TYPE_ADDR
index 69119f1d96d46026918ce1862e1338095282f00d..ef7cfdf3964aa910b34c5c8dee6bde7535584734 100644 (file)
 
 (CvtBoolToUint8 ...) => (Copy ...)
 
-(Round(32|64)F ...) => (Copy ...)
+(Round(32|64)F ...) => (LoweredRound(32|64)F ...)
 
 // comparisons
 (Eq8 x y)  => (SGTU (MOVVconst [1]) (XOR (ZeroExt8to64 x) (ZeroExt8to64 y)))
 (REMVU _ (MOVVconst [1])) => (MOVVconst [0])                       // mod
 (REMVU x (MOVVconst [c])) && isPowerOfTwo(c) => (ANDconst [c-1] x) // mod
 
+// FMA
+(FMA ...) => (FMADDD ...)
+((ADD|SUB)F (MULF x y) z) && z.Block.Func.useFMA(v) => (FM(ADD|SUB)F x y z)
+((ADD|SUB)D (MULD x y) z) && z.Block.Func.useFMA(v) => (FM(ADD|SUB)D x y z)
+// z - xy -> -(xy - z)
+(SUBF z (MULF x y)) && z.Block.Func.useFMA(v) => (FNMSUBF x y z)
+(SUBD z (MULD x y)) && z.Block.Func.useFMA(v) => (FNMSUBD x y z)
+// z + (-xy) -> -(xy - z)
+// z - (-xy) -> xy + z
+((ADD|SUB)F z (NEGF (MULF x y))) && z.Block.Func.useFMA(v) => (F(NMSUB|MADD)F x y z)
+((ADD|SUB)D z (NEGD (MULD x y))) && z.Block.Func.useFMA(v) => (F(NMSUB|MADD)D x y z)
+// -xy - z -> -(xy + z)
+(SUBF (NEGF (MULF x y)) z) && z.Block.Func.useFMA(v) => (FNMADDF x y z)
+(SUBD (NEGD (MULD x y)) z) && z.Block.Func.useFMA(v) => (FNMADDD x y z)
+
 // generic simplifications
 (ADDV x (NEGV y)) => (SUBV x y)
 (SUBV x x) => (MOVVconst [0])
index 5789760683c43a454592a2f881243bde001255cd..465e724a194715d15abbca9e60890a00eb6cd2ac 100644 (file)
@@ -151,6 +151,7 @@ func init() {
                fp01      = regInfo{inputs: nil, outputs: []regMask{fp}}
                fp11      = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
                fp21      = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+               fp31      = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}}
                fp2flags  = regInfo{inputs: []regMask{fp, fp}}
                fpload    = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}}
                fp2load   = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{fp}}
@@ -193,6 +194,15 @@ func init() {
                {name: "NOR", argLength: 2, reg: gp21, asm: "NOR", commutative: true},                // ^(arg0 | arg1)
                {name: "NORconst", argLength: 1, reg: gp11, asm: "NOR", aux: "Int64"},                // ^(arg0 | auxInt)
 
+               {name: "FMADDF", argLength: 3, reg: fp31, asm: "FMADDF", commutative: true, typ: "Float32"},   // (arg0 * arg1) + arg2
+               {name: "FMADDD", argLength: 3, reg: fp31, asm: "FMADDD", commutative: true, typ: "Float64"},   // (arg0 * arg1) + arg2
+               {name: "FMSUBF", argLength: 3, reg: fp31, asm: "FMSUBF", commutative: true, typ: "Float32"},   // (arg0 * arg1) - arg2
+               {name: "FMSUBD", argLength: 3, reg: fp31, asm: "FMSUBD", commutative: true, typ: "Float64"},   // (arg0 * arg1) - arg2
+               {name: "FNMADDF", argLength: 3, reg: fp31, asm: "FNMADDF", commutative: true, typ: "Float32"}, // -((arg0 * arg1) + arg2)
+               {name: "FNMADDD", argLength: 3, reg: fp31, asm: "FNMADDD", commutative: true, typ: "Float64"}, // -((arg0 * arg1) + arg2)
+               {name: "FNMSUBF", argLength: 3, reg: fp31, asm: "FNMSUBF", commutative: true, typ: "Float32"}, // -((arg0 * arg1) - arg2)
+               {name: "FNMSUBD", argLength: 3, reg: fp31, asm: "FNMSUBD", commutative: true, typ: "Float64"}, // -((arg0 * arg1) - arg2)
+
                {name: "NEGV", argLength: 1, reg: gp11},                // -arg0
                {name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"},   // -arg0, float32
                {name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"},   // -arg0, float64
@@ -330,6 +340,10 @@ func init() {
                {name: "MOVFD", argLength: 1, reg: fp11, asm: "MOVFD"},     // float32 -> float64
                {name: "MOVDF", argLength: 1, reg: fp11, asm: "MOVDF"},     // float64 -> float32
 
+               // Round ops to block fused-multiply-add extraction.
+               {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true},
+               {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true},
+
                // function calls
                {name: "CALLstatic", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true},                                               // call static function aux.(*obj.LSym).  last arg=mem, auxint=argsize, returns mem
                {name: "CALLtail", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true},                                 // tail call static function aux.(*obj.LSym).  last arg=mem, auxint=argsize, returns mem
index b18a4385d2be4dc0e8e2927135c013e4b56947c1..bcc358db50803821893aea2f83c61865237ea6a4 100644 (file)
@@ -1783,6 +1783,14 @@ const (
        OpLOONG64XORconst
        OpLOONG64NOR
        OpLOONG64NORconst
+       OpLOONG64FMADDF
+       OpLOONG64FMADDD
+       OpLOONG64FMSUBF
+       OpLOONG64FMSUBD
+       OpLOONG64FNMADDF
+       OpLOONG64FNMADDD
+       OpLOONG64FNMSUBF
+       OpLOONG64FNMSUBD
        OpLOONG64NEGV
        OpLOONG64NEGF
        OpLOONG64NEGD
@@ -1887,6 +1895,8 @@ const (
        OpLOONG64TRUNCDV
        OpLOONG64MOVFD
        OpLOONG64MOVDF
+       OpLOONG64LoweredRound32F
+       OpLOONG64LoweredRound64F
        OpLOONG64CALLstatic
        OpLOONG64CALLtail
        OpLOONG64CALLclosure
@@ -23928,6 +23938,134 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:        "FMADDF",
+               argLen:      3,
+               commutative: true,
+               asm:         loong64.AFMADDF,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
+       {
+               name:        "FMADDD",
+               argLen:      3,
+               commutative: true,
+               asm:         loong64.AFMADDD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
+       {
+               name:        "FMSUBF",
+               argLen:      3,
+               commutative: true,
+               asm:         loong64.AFMSUBF,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
+       {
+               name:        "FMSUBD",
+               argLen:      3,
+               commutative: true,
+               asm:         loong64.AFMSUBD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
+       {
+               name:        "FNMADDF",
+               argLen:      3,
+               commutative: true,
+               asm:         loong64.AFNMADDF,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
+       {
+               name:        "FNMADDD",
+               argLen:      3,
+               commutative: true,
+               asm:         loong64.AFNMADDD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
+       {
+               name:        "FNMSUBF",
+               argLen:      3,
+               commutative: true,
+               asm:         loong64.AFNMSUBF,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
+       {
+               name:        "FNMSUBD",
+               argLen:      3,
+               commutative: true,
+               asm:         loong64.AFNMSUBD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                               {2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
        {
                name:   "NEGV",
                argLen: 1,
@@ -25326,6 +25464,32 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "LoweredRound32F",
+               argLen:       1,
+               resultInArg0: true,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
+       {
+               name:         "LoweredRound64F",
+               argLen:       1,
+               resultInArg0: true,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+                       outputs: []outputInfo{
+                               {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+                       },
+               },
+       },
        {
                name:         "CALLstatic",
                auxType:      auxCallOff,
index fedcd196d469cb5f7d75175435c30c0bcad007b3..e8c1d26554aaa91791965a5496e5e703c9942017 100644 (file)
@@ -216,6 +216,9 @@ func rewriteValueLOONG64(v *Value) bool {
                return rewriteValueLOONG64_OpEqB(v)
        case OpEqPtr:
                return rewriteValueLOONG64_OpEqPtr(v)
+       case OpFMA:
+               v.Op = OpLOONG64FMADDD
+               return true
        case OpGetCallerPC:
                v.Op = OpLOONG64LoweredGetCallerPC
                return true
@@ -244,6 +247,10 @@ func rewriteValueLOONG64(v *Value) bool {
                return rewriteValueLOONG64_OpIsNonNil(v)
        case OpIsSliceInBounds:
                return rewriteValueLOONG64_OpIsSliceInBounds(v)
+       case OpLOONG64ADDD:
+               return rewriteValueLOONG64_OpLOONG64ADDD(v)
+       case OpLOONG64ADDF:
+               return rewriteValueLOONG64_OpLOONG64ADDF(v)
        case OpLOONG64ADDV:
                return rewriteValueLOONG64_OpLOONG64ADDV(v)
        case OpLOONG64ADDVconst:
@@ -392,6 +399,10 @@ func rewriteValueLOONG64(v *Value) bool {
                return rewriteValueLOONG64_OpLOONG64SRLV(v)
        case OpLOONG64SRLVconst:
                return rewriteValueLOONG64_OpLOONG64SRLVconst(v)
+       case OpLOONG64SUBD:
+               return rewriteValueLOONG64_OpLOONG64SUBD(v)
+       case OpLOONG64SUBF:
+               return rewriteValueLOONG64_OpLOONG64SUBF(v)
        case OpLOONG64SUBV:
                return rewriteValueLOONG64_OpLOONG64SUBV(v)
        case OpLOONG64SUBVconst:
@@ -596,10 +607,10 @@ func rewriteValueLOONG64(v *Value) bool {
        case OpRotateLeft8:
                return rewriteValueLOONG64_OpRotateLeft8(v)
        case OpRound32F:
-               v.Op = OpCopy
+               v.Op = OpLOONG64LoweredRound32F
                return true
        case OpRound64F:
-               v.Op = OpCopy
+               v.Op = OpLOONG64LoweredRound64F
                return true
        case OpRsh16Ux16:
                return rewriteValueLOONG64_OpRsh16Ux16(v)
@@ -1410,6 +1421,104 @@ func rewriteValueLOONG64_OpIsSliceInBounds(v *Value) bool {
                return true
        }
 }
+func rewriteValueLOONG64_OpLOONG64ADDD(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (ADDD (MULD x y) z)
+       // cond: z.Block.Func.useFMA(v)
+       // result: (FMADDD x y z)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       if v_0.Op != OpLOONG64MULD {
+                               continue
+                       }
+                       y := v_0.Args[1]
+                       x := v_0.Args[0]
+                       z := v_1
+                       if !(z.Block.Func.useFMA(v)) {
+                               continue
+                       }
+                       v.reset(OpLOONG64FMADDD)
+                       v.AddArg3(x, y, z)
+                       return true
+               }
+               break
+       }
+       // match: (ADDD z (NEGD (MULD x y)))
+       // cond: z.Block.Func.useFMA(v)
+       // result: (FNMSUBD x y z)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       z := v_0
+                       if v_1.Op != OpLOONG64NEGD {
+                               continue
+                       }
+                       v_1_0 := v_1.Args[0]
+                       if v_1_0.Op != OpLOONG64MULD {
+                               continue
+                       }
+                       y := v_1_0.Args[1]
+                       x := v_1_0.Args[0]
+                       if !(z.Block.Func.useFMA(v)) {
+                               continue
+                       }
+                       v.reset(OpLOONG64FNMSUBD)
+                       v.AddArg3(x, y, z)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValueLOONG64_OpLOONG64ADDF(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (ADDF (MULF x y) z)
+       // cond: z.Block.Func.useFMA(v)
+       // result: (FMADDF x y z)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       if v_0.Op != OpLOONG64MULF {
+                               continue
+                       }
+                       y := v_0.Args[1]
+                       x := v_0.Args[0]
+                       z := v_1
+                       if !(z.Block.Func.useFMA(v)) {
+                               continue
+                       }
+                       v.reset(OpLOONG64FMADDF)
+                       v.AddArg3(x, y, z)
+                       return true
+               }
+               break
+       }
+       // match: (ADDF z (NEGF (MULF x y)))
+       // cond: z.Block.Func.useFMA(v)
+       // result: (FNMSUBF x y z)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       z := v_0
+                       if v_1.Op != OpLOONG64NEGF {
+                               continue
+                       }
+                       v_1_0 := v_1.Args[0]
+                       if v_1_0.Op != OpLOONG64MULF {
+                               continue
+                       }
+                       y := v_1_0.Args[1]
+                       x := v_1_0.Args[0]
+                       if !(z.Block.Func.useFMA(v)) {
+                               continue
+                       }
+                       v.reset(OpLOONG64FNMSUBF)
+                       v.AddArg3(x, y, z)
+                       return true
+               }
+               break
+       }
+       return false
+}
 func rewriteValueLOONG64_OpLOONG64ADDV(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
@@ -5944,6 +6053,168 @@ func rewriteValueLOONG64_OpLOONG64SRLVconst(v *Value) bool {
        }
        return false
 }
+func rewriteValueLOONG64_OpLOONG64SUBD(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (SUBD (MULD x y) z)
+       // cond: z.Block.Func.useFMA(v)
+       // result: (FMSUBD x y z)
+       for {
+               if v_0.Op != OpLOONG64MULD {
+                       break
+               }
+               y := v_0.Args[1]
+               x := v_0.Args[0]
+               z := v_1
+               if !(z.Block.Func.useFMA(v)) {
+                       break
+               }
+               v.reset(OpLOONG64FMSUBD)
+               v.AddArg3(x, y, z)
+               return true
+       }
+       // match: (SUBD z (MULD x y))
+       // cond: z.Block.Func.useFMA(v)
+       // result: (FNMSUBD x y z)
+       for {
+               z := v_0
+               if v_1.Op != OpLOONG64MULD {
+                       break
+               }
+               y := v_1.Args[1]
+               x := v_1.Args[0]
+               if !(z.Block.Func.useFMA(v)) {
+                       break
+               }
+               v.reset(OpLOONG64FNMSUBD)
+               v.AddArg3(x, y, z)
+               return true
+       }
+       // match: (SUBD z (NEGD (MULD x y)))
+       // cond: z.Block.Func.useFMA(v)
+       // result: (FMADDD x y z)
+       for {
+               z := v_0
+               if v_1.Op != OpLOONG64NEGD {
+                       break
+               }
+               v_1_0 := v_1.Args[0]
+               if v_1_0.Op != OpLOONG64MULD {
+                       break
+               }
+               y := v_1_0.Args[1]
+               x := v_1_0.Args[0]
+               if !(z.Block.Func.useFMA(v)) {
+                       break
+               }
+               v.reset(OpLOONG64FMADDD)
+               v.AddArg3(x, y, z)
+               return true
+       }
+       // match: (SUBD (NEGD (MULD x y)) z)
+       // cond: z.Block.Func.useFMA(v)
+       // result: (FNMADDD x y z)
+       for {
+               if v_0.Op != OpLOONG64NEGD {
+                       break
+               }
+               v_0_0 := v_0.Args[0]
+               if v_0_0.Op != OpLOONG64MULD {
+                       break
+               }
+               y := v_0_0.Args[1]
+               x := v_0_0.Args[0]
+               z := v_1
+               if !(z.Block.Func.useFMA(v)) {
+                       break
+               }
+               v.reset(OpLOONG64FNMADDD)
+               v.AddArg3(x, y, z)
+               return true
+       }
+       return false
+}
+func rewriteValueLOONG64_OpLOONG64SUBF(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (SUBF (MULF x y) z)
+       // cond: z.Block.Func.useFMA(v)
+       // result: (FMSUBF x y z)
+       for {
+               if v_0.Op != OpLOONG64MULF {
+                       break
+               }
+               y := v_0.Args[1]
+               x := v_0.Args[0]
+               z := v_1
+               if !(z.Block.Func.useFMA(v)) {
+                       break
+               }
+               v.reset(OpLOONG64FMSUBF)
+               v.AddArg3(x, y, z)
+               return true
+       }
+       // match: (SUBF z (MULF x y))
+       // cond: z.Block.Func.useFMA(v)
+       // result: (FNMSUBF x y z)
+       for {
+               z := v_0
+               if v_1.Op != OpLOONG64MULF {
+                       break
+               }
+               y := v_1.Args[1]
+               x := v_1.Args[0]
+               if !(z.Block.Func.useFMA(v)) {
+                       break
+               }
+               v.reset(OpLOONG64FNMSUBF)
+               v.AddArg3(x, y, z)
+               return true
+       }
+       // match: (SUBF z (NEGF (MULF x y)))
+       // cond: z.Block.Func.useFMA(v)
+       // result: (FMADDF x y z)
+       for {
+               z := v_0
+               if v_1.Op != OpLOONG64NEGF {
+                       break
+               }
+               v_1_0 := v_1.Args[0]
+               if v_1_0.Op != OpLOONG64MULF {
+                       break
+               }
+               y := v_1_0.Args[1]
+               x := v_1_0.Args[0]
+               if !(z.Block.Func.useFMA(v)) {
+                       break
+               }
+               v.reset(OpLOONG64FMADDF)
+               v.AddArg3(x, y, z)
+               return true
+       }
+       // match: (SUBF (NEGF (MULF x y)) z)
+       // cond: z.Block.Func.useFMA(v)
+       // result: (FNMADDF x y z)
+       for {
+               if v_0.Op != OpLOONG64NEGF {
+                       break
+               }
+               v_0_0 := v_0.Args[0]
+               if v_0_0.Op != OpLOONG64MULF {
+                       break
+               }
+               y := v_0_0.Args[1]
+               x := v_0_0.Args[0]
+               z := v_1
+               if !(z.Block.Func.useFMA(v)) {
+                       break
+               }
+               v.reset(OpLOONG64FNMADDF)
+               v.AddArg3(x, y, z)
+               return true
+       }
+       return false
+}
 func rewriteValueLOONG64_OpLOONG64SUBV(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
index fda273b3e561f09c0cc43172c51e584d01b37a23..a1d962ee3ad50475ecca78016bd2432b01535784 100644 (file)
@@ -689,7 +689,7 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
                func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                        return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
                },
-               sys.ARM64, sys.PPC64, sys.RISCV64, sys.S390X)
+               sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X)
        addF("math", "FMA",
                func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
                        if !s.config.UseFMA {
index 4e59714ce7fe22782e5c3198389eccf6400d4f6c..9cf8cbc8772001c925162ee334a9b1866d8b7a58 100644 (file)
@@ -399,6 +399,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
        {"loong64", "internal/runtime/sys", "Len8"}:                struct{}{},
        {"loong64", "math", "Abs"}:                                 struct{}{},
        {"loong64", "math", "Copysign"}:                            struct{}{},
+       {"loong64", "math", "FMA"}:                                 struct{}{},
        {"loong64", "math", "sqrt"}:                                struct{}{},
        {"loong64", "math/big", "mulWW"}:                           struct{}{},
        {"loong64", "math/bits", "Add"}:                            struct{}{},
index a77843d0e733a48766ab079cd0d8d26ef9faf436..1b85eba35249c95e09bb94029f01e6f7cf20c693 100644 (file)
@@ -72,6 +72,7 @@ func FusedAdd32(x, y, z float32) float32 {
        // s390x:"FMADDS\t"
        // ppc64x:"FMADDS\t"
        // arm64:"FMADDS"
+       // loong64:"FMADDF\t"
        // riscv64:"FMADDS\t"
        return x*y + z
 }
@@ -80,11 +81,13 @@ func FusedSub32_a(x, y, z float32) float32 {
        // s390x:"FMSUBS\t"
        // ppc64x:"FMSUBS\t"
        // riscv64:"FMSUBS\t"
+       // loong64:"FMSUBF\t"
        return x*y - z
 }
 
 func FusedSub32_b(x, y, z float32) float32 {
        // arm64:"FMSUBS"
+       // loong64:"FNMSUBF\t"
        // riscv64:"FNMSUBS\t"
        return z - x*y
 }
@@ -93,6 +96,7 @@ func FusedAdd64(x, y, z float64) float64 {
        // s390x:"FMADD\t"
        // ppc64x:"FMADD\t"
        // arm64:"FMADDD"
+       // loong64:"FMADDD\t"
        // riscv64:"FMADDD\t"
        return x*y + z
 }
@@ -101,11 +105,13 @@ func FusedSub64_a(x, y, z float64) float64 {
        // s390x:"FMSUB\t"
        // ppc64x:"FMSUB\t"
        // riscv64:"FMSUBD\t"
+       // loong64:"FMSUBD\t"
        return x*y - z
 }
 
 func FusedSub64_b(x, y, z float64) float64 {
        // arm64:"FMSUBD"
+       // loong64:"FNMSUBD\t"
        // riscv64:"FNMSUBD\t"
        return z - x*y
 }
index 806f9096484bb0e46303e68fc2e86529e92f82f7..4ce5fa419d2b79e84cd12bbbbd83db78c9615733 100644 (file)
@@ -132,6 +132,7 @@ func fma(x, y, z float64) float64 {
        // amd64:"VFMADD231SD"
        // arm/6:"FMULAD"
        // arm64:"FMADDD"
+       // loong64:"FMADDD"
        // s390x:"FMADD"
        // ppc64x:"FMADD"
        // riscv64:"FMADDD"